Fit the P/NBD Model to External Data
1 Load Transactional Datasets
We first want to load the real-world transactional dataset.
1.1 Load Pre-processed Transactional Data
# Load the pre-processed per-customer cohort table built in an earlier workbook.
customer_cohortdata_tbl <- read_rds("data/customer_cohort_tbl.rds")
customer_cohortdata_tbl %>% glimpse()## Rows: 5,852
## Columns: 5
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12347", "12348", "12349", "12350", "12351", …
## $ cohort_qtr [3m[38;5;246m<yearqtr>[39m[23m 2010 Q1, 2010 Q4, 2010 Q3, 2010 Q2, 2011 Q1, 2010 …
## $ cohort_ym [3m[38;5;246m<chr>[39m[23m "2010 03", "2010 10", "2010 09", "2010 04", "2011 02",…
## $ first_tnx_date [3m[38;5;246m<date>[39m[23m 2010-03-02, 2010-10-31, 2010-09-27, 2010-04-29, 2011-…
## $ total_tnx_count [3m[38;5;246m<int>[39m[23m 3, 8, 5, 3, 1, 1, 9, 2, 1, 2, 6, 2, 5, 10, 6, 4, 10, 2…
We also want to load the raw transaction data, as we want to transform it into the form we need.
# Load the cleaned line-level (one row per invoice line) retail transaction data.
retail_transaction_data_tbl <- read_rds("data/retail_data_cleaned_tbl.rds")
retail_transaction_data_tbl %>% glimpse()## Rows: 1,021,424
## Columns: 23
## $ row_id [3m[38;5;246m<chr>[39m[23m "ROW0000001", "ROW0000002", "ROW0000003", "ROW000000…
## $ excel_sheet [3m[38;5;246m<chr>[39m[23m "Year 2009-2010", "Year 2009-2010", "Year 2009-2010"…
## $ invoice_id [3m[38;5;246m<chr>[39m[23m "489434", "489434", "489434", "489434", "489434", "4…
## $ stock_code [3m[38;5;246m<chr>[39m[23m "85048", "79323P", "79323W", "22041", "21232", "2206…
## $ description [3m[38;5;246m<chr>[39m[23m "15CM CHRISTMAS GLASS BALL 20 LIGHTS", "PINK CHERRY …
## $ quantity [3m[38;5;246m<dbl>[39m[23m 12, 12, 12, 48, 24, 24, 24, 10, 12, 12, 24, 12, 10, …
## $ invoice_date [3m[38;5;246m<date>[39m[23m 2009-12-01, 2009-12-01, 2009-12-01, 2009-12-01, 200…
## $ price [3m[38;5;246m<dbl>[39m[23m 6.95, 6.75, 6.75, 2.10, 1.25, 1.65, 1.25, 5.95, 2.55…
## $ customer_id [3m[38;5;246m<chr>[39m[23m "13085", "13085", "13085", "13085", "13085", "13085"…
## $ country [3m[38;5;246m<chr>[39m[23m "United Kingdom", "United Kingdom", "United Kingdom"…
## $ stock_code_upr [3m[38;5;246m<chr>[39m[23m "85048", "79323P", "79323W", "22041", "21232", "2206…
## $ cancellation [3m[38;5;246m<lgl>[39m[23m FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
## $ invoice_dttm [3m[38;5;246m<dttm>[39m[23m 2009-12-01 07:45:00, 2009-12-01 07:45:00, 2009-12-0…
## $ invoice_month [3m[38;5;246m<chr>[39m[23m "December", "December", "December", "December", "Dec…
## $ invoice_dow [3m[38;5;246m<chr>[39m[23m "Tuesday", "Tuesday", "Tuesday", "Tuesday", "Tuesday…
## $ invoice_dom [3m[38;5;246m<chr>[39m[23m "01", "01", "01", "01", "01", "01", "01", "01", "01"…
## $ invoice_hour [3m[38;5;246m<chr>[39m[23m "07", "07", "07", "07", "07", "07", "07", "07", "07"…
## $ invoice_minute [3m[38;5;246m<chr>[39m[23m "45", "45", "45", "45", "45", "45", "45", "45", "45"…
## $ invoice_woy [3m[38;5;246m<chr>[39m[23m "49", "49", "49", "49", "49", "49", "49", "49", "49"…
## $ invoice_ym [3m[38;5;246m<chr>[39m[23m "200912", "200912", "200912", "200912", "200912", "2…
## $ stock_value [3m[38;5;246m<dbl>[39m[23m 83.40, 81.00, 81.00, 100.80, 30.00, 39.60, 30.00, 59…
## $ invoice_monthprop [3m[38;5;246m<dbl>[39m[23m 0.04347826, 0.04347826, 0.04347826, 0.04347826, 0.04…
## $ exclude [3m[38;5;246m<lgl>[39m[23m FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FAL…
We need to aggregate this data up into a form to match our synthetic data, so
we aggregate transactions by invoice_id.
# Aggregate line-level retail transactions to one row per invoice:
# total spend per (timestamp, customer, invoice), keeping only rows with a
# known customer id and invoices with positive total spend.
customer_transactions_tbl <- retail_transaction_data_tbl %>%
  drop_na(customer_id) %>%
  # NOTE(review): original read `filter(exclude = TRUE)`, which is a dplyr
  # error (`=` instead of `==`). Per the column name and the reported row
  # count, the intent appears to be dropping rows flagged for exclusion --
  # confirm against the data-cleaning workbook.
  filter(!exclude) %>%
  group_by(tnx_timestamp = invoice_dttm, customer_id, invoice_id) %>%
  summarise(
    .groups = "drop",
    total_spend = sum(stock_value)
  ) %>%
  filter(total_spend > 0) %>%
  arrange(tnx_timestamp, customer_id)
customer_transactions_tbl %>% glimpse()## Rows: 37,031
## Columns: 4
## $ tnx_timestamp [3m[38;5;246m<dttm>[39m[23m 2009-12-01 07:45:00, 2009-12-01 07:45:59, 2009-12-01 09…
## $ customer_id [3m[38;5;246m<chr>[39m[23m "13085", "13085", "13078", "15362", "18102", "12682", "1…
## $ invoice_id [3m[38;5;246m<chr>[39m[23m "489434", "489435", "489436", "489437", "489438", "48943…
## $ total_spend [3m[38;5;246m<dbl>[39m[23m 505.30, 145.80, 630.33, 310.75, 2286.24, 426.30, 50.40, …
We reproduce the visualisation of the transaction times used in previous workbooks.
# Sample 30 customers having more than three transactions and unnest their
# transaction histories for plotting.
nested_cust_tbl <- group_nest(
  customer_transactions_tbl, customer_id, .key = "cust_data"
)
plot_tbl <- nested_cust_tbl %>%
  filter(map_int(cust_data, nrow) > 3) %>%
  slice_sample(n = 30) %>%
  unnest(cust_data)
# Plot each sampled customer's transactions as a horizontal timeline:
# one row per customer, points at transaction times joined by a line.
ggplot(plot_tbl, aes(x = tnx_timestamp, y = customer_id)) +
geom_line() +
geom_point() +
labs(
x = "Date",
y = "Customer ID",
title = "Visualisation of Customer Transaction Times"
) +
theme(axis.text.y = element_text(size = 10))2 Fit the Fixed Prior P/NBD Model
# Directories for compiled Stan executables / CSV output and Stan source files.
stan_modeldir <- "stan_models"
stan_codedir <- "stan_code"We first need to construct our fitted dataset from this external data.
In terms of choosing a cut-off point, we will consider all transactions up to and including March 31, 2011.
# Build the calibration-period CBS summary (x, t_x, T_cal) per customer from
# transactions up to the 2011-04-01 cut-off.
btyd_fitdata_tbl <- calculate_transaction_cbs_data(
  customer_transactions_tbl,
  last_date = as.POSIXct("2011-04-01")
)
btyd_fitdata_tbl %>% glimpse()## Rows: 4,716
## Columns: 6
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12347", "12348", "12349", "12350", "12351", "…
## $ first_tnx_date [3m[38;5;246m<dttm>[39m[23m 2009-12-14 08:34:00, 2010-10-31 14:19:59, 2010-09-27 1…
## $ last_tnx_date [3m[38;5;246m<dttm>[39m[23m 2011-01-18 10:00:59, 2011-01-26 14:29:59, 2011-01-25 1…
## $ x [3m[38;5;246m<dbl>[39m[23m 11, 2, 2, 2, 0, 0, 6, 0, 0, 3, 1, 2, 7, 4, 3, 1, 1, 0, …
## $ t_x [3m[38;5;246m<dbl>[39m[23m 57.151488095, 12.429563492, 17.117361111, 25.970535714,…
## $ T_cal [3m[38;5;246m<dbl>[39m[23m 67.520437, 21.628968, 26.482242, 48.063492, 8.190377, 1…
We also want to construct some summary statistics for the data after that cut-off date.
# Per-customer summary of hold-out period transactions (on/after the
# 2011-04-01 cut-off): transaction count plus first and last times.
holdout_start <- as.POSIXct("2011-04-01")
btyd_obs_stats_tbl <- customer_transactions_tbl %>%
  filter(tnx_timestamp >= holdout_start) %>%
  group_by(customer_id) %>%
  summarise(
    tnx_count = n(),
    first_tnx = min(tnx_timestamp),
    last_tnx = max(tnx_timestamp),
    .groups = "drop"
  )
btyd_obs_stats_tbl %>% glimpse()## Rows: 3,849
## Columns: 4
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12347", "12348", "12349", "12352", "12353", "12354", "123…
## $ tnx_count [3m[38;5;246m<int>[39m[23m 5, 2, 1, 3, 1, 1, 1, 2, 1, 2, 2, 3, 9, 2, 4, 1, 1, 2, 2, 1…
## $ first_tnx [3m[38;5;246m<dttm>[39m[23m 2011-04-07 10:43:00, 2011-04-05 10:47:00, 2011-11-21 09:5…
## $ last_tnx [3m[38;5;246m<dttm>[39m[23m 2011-12-07 15:51:59, 2011-09-25 13:13:00, 2011-11-21 09:5…
We now compile this model using CmdStanR.
# Compile the fixed-prior P/NBD Stan model with CmdStanR; `pedantic = TRUE`
# turns on extra compiler diagnostics, and the executable is cached in
# stan_modeldir. NOTE(review): the source path hard-codes "stan_code/" rather
# than reusing stan_codedir -- confirm this is intentional.
pnbd_extdata_fixed_stanmodel <- cmdstan_model(
"stan_code/pnbd_fixed.stan",
include_paths = stan_codedir,
pedantic = TRUE,
dir = stan_modeldir
)2.1 Fit the Model
We then use this compiled model with our data to produce a fit of the data.
# Assemble the Stan data list: the per-customer CBS fields plus the fixed
# prior hyperparameters (mean and coefficient of variation for lambda / mu).
stan_modelname <- "pnbd_extdata_fixed"
stanfit_prefix <- str_c("fit_", stan_modelname)

stan_data_lst <- btyd_fitdata_tbl %>%
  select(customer_id, x, t_x, T_cal) %>%
  compose_data(
    lambda_mn = 0.25,
    lambda_cv = 0.60,
    mu_mn = 0.10,
    mu_cv = 0.60  # trailing comma removed: it creates an empty argument in R
  )
# Sample the fixed-prior model: 4 chains, 200 warmup + 50 sampling iterations
# each, with the CSV output written under stan_modeldir for later reuse.
pnbd_extdata_fixed_stanfit <- pnbd_extdata_fixed_stanmodel$sample(
  data = stan_data_lst,
  chains = 4,
  iter_warmup = 200,
  iter_sampling = 50,
  seed = 4201,
  save_warmup = TRUE,
  output_dir = stan_modeldir,
  output_basename = stanfit_prefix  # trailing comma removed (empty argument)
)## Running MCMC with 4 parallel chains...
##
## Chain 1 Iteration: 1 / 250 [ 0%] (Warmup)
## Chain 2 Iteration: 1 / 250 [ 0%] (Warmup)
## Chain 3 Iteration: 1 / 250 [ 0%] (Warmup)
## Chain 4 Iteration: 1 / 250 [ 0%] (Warmup)
## Chain 1 Iteration: 100 / 250 [ 40%] (Warmup)
## Chain 2 Iteration: 100 / 250 [ 40%] (Warmup)
## Chain 3 Iteration: 100 / 250 [ 40%] (Warmup)
## Chain 4 Iteration: 100 / 250 [ 40%] (Warmup)
## Chain 2 Iteration: 200 / 250 [ 80%] (Warmup)
## Chain 2 Iteration: 201 / 250 [ 80%] (Sampling)
## Chain 1 Iteration: 200 / 250 [ 80%] (Warmup)
## Chain 1 Iteration: 201 / 250 [ 80%] (Sampling)
## Chain 3 Iteration: 200 / 250 [ 80%] (Warmup)
## Chain 3 Iteration: 201 / 250 [ 80%] (Sampling)
## Chain 4 Iteration: 200 / 250 [ 80%] (Warmup)
## Chain 4 Iteration: 201 / 250 [ 80%] (Sampling)
## Chain 2 Iteration: 250 / 250 [100%] (Sampling)
## Chain 2 finished in 46.9 seconds.
## Chain 1 Iteration: 250 / 250 [100%] (Sampling)
## Chain 1 finished in 47.6 seconds.
## Chain 3 Iteration: 250 / 250 [100%] (Sampling)
## Chain 3 finished in 48.8 seconds.
## Chain 4 Iteration: 250 / 250 [100%] (Sampling)
## Chain 4 finished in 48.9 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 48.1 seconds.
## Total execution time: 49.4 seconds.
pnbd_extdata_fixed_stanfit$summary()## # A tibble: 14,149 × 10
## variable mean median sd mad q5 q95 rhat ess_bulk
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 lp__ -1.36e+5 -1.36e+5 69.4 68.9 -1.36e+5 -1.36e+5 1.02 91.2
## 2 lambda[1] 1.90e-1 1.86e-1 0.0473 0.0508 1.18e-1 2.74e-1 1.00 285.
## 3 lambda[2] 1.64e-1 1.53e-1 0.0761 0.0646 6.21e-2 3.17e-1 1.05 79.7
## 4 lambda[3] 1.46e-1 1.31e-1 0.0633 0.0579 5.83e-2 2.61e-1 0.996 303.
## 5 lambda[4] 1.12e-1 1.08e-1 0.0531 0.0540 4.16e-2 2.06e-1 1.05 285.
## 6 lambda[5] 1.88e-1 1.70e-1 0.117 0.111 4.24e-2 3.91e-1 1.01 210.
## 7 lambda[6] 1.85e-1 1.58e-1 0.118 0.0976 4.11e-2 4.20e-1 1.04 168.
## 8 lambda[7] 2.80e-1 2.71e-1 0.109 0.114 1.08e-1 4.69e-1 1.03 163.
## 9 lambda[8] 2.00e-1 1.77e-1 0.134 0.119 3.69e-2 4.77e-1 1.02 446.
## 10 lambda[9] 1.89e-1 1.58e-1 0.120 0.107 3.34e-2 4.30e-1 1.03 107.
## # … with 14,139 more rows, and 1 more variable: ess_tail <dbl>
We have some basic HMC-based validity statistics we can check.
pnbd_extdata_fixed_stanfit$cmdstan_diagnose()## Processing csv files: /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed-1.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed-2.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed-3.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed-4.csvWarning: non-fatal error reading adaptation data
##
##
## Checking sampler transitions treedepth.
## Treedepth satisfactory for all transitions.
##
## Checking sampler transitions for divergences.
## No divergent transitions found.
##
## Checking E-BFMI - sampler transitions HMC potential energy.
## E-BFMI satisfactory.
##
## Effective sample size satisfactory.
##
## The following parameters had split R-hat greater than 1.05:
## lambda[425], lambda[826], mu[1791], mu[1795], mu[2331], p_alive[9], p_alive[84], p_alive[110], p_alive[116], p_alive[144], p_alive[145], p_alive[150], p_alive[225], p_alive[227], p_alive[277], p_alive[281], p_alive[364], p_alive[390], p_alive[432], p_alive[443], p_alive[503], p_alive[552], p_alive[580], p_alive[587], p_alive[652], p_alive[687], p_alive[714], p_alive[849], p_alive[863], p_alive[923], p_alive[931], p_alive[1028], p_alive[1029], p_alive[1038], p_alive[1073], p_alive[1107], p_alive[1144], p_alive[1171], p_alive[1330], p_alive[1449], p_alive[1496], p_alive[1498], p_alive[1575], p_alive[1613], p_alive[1646], p_alive[1675], p_alive[1700], p_alive[1717], p_alive[1791], p_alive[1795], p_alive[1846], p_alive[1849], p_alive[1896], p_alive[1908], p_alive[1930], p_alive[2020], p_alive[2041], p_alive[2048], p_alive[2101], p_alive[2116], p_alive[2180], p_alive[2232], p_alive[2261], p_alive[2301], p_alive[2405], p_alive[2508], p_alive[2563], p_alive[2577], p_alive[2677], p_alive[2840], p_alive[2934], p_alive[3023], p_alive[3073], p_alive[3345], p_alive[3397], p_alive[3567], p_alive[3788], p_alive[3817], p_alive[3854], p_alive[3967], p_alive[4010], p_alive[4047], p_alive[4068], p_alive[4090], p_alive[4145], p_alive[4181], p_alive[4290], p_alive[4307], p_alive[4437], p_alive[4509], p_alive[4600]
## Such high values indicate incomplete mixing and biased estimation.
## You should consider regularizating your model with additional prior information or a more effective parameterization.
##
## Processing complete.
2.2 Visual Diagnostics of the Sample Validity
Now that we have a sample from the posterior distribution we need to create a few different visualisations of the diagnostics.
# Small subset of per-customer parameters to keep diagnostic plots legible.
parameter_subset <- c(
  sprintf("lambda[%d]", 1:4),
  sprintf("mu[%d]", 1:4)
)
# Trace plot of the post-warmup draws for the selected lambda / mu parameters;
# well-mixed chains should overlap with no trends.
pnbd_extdata_fixed_stanfit$draws(inc_warmup = FALSE) %>%
mcmc_trace(pars = parameter_subset) +
expand_limits(y = 0) +
labs(
x = "Iteration",
y = "Value",
title = "Traceplot of Sample of Lambda and Mu Values"
) +
theme(axis.text.x = element_text(size = 10))A common MCMC diagnostic is \(\hat{R}\) - which is a measure of the ‘similarity’ of the chains.
# R-hat convergence diagnostic across all lambda and mu parameters.
pnbd_extdata_fixed_stanfit %>%
rhat(pars = c("lambda", "mu")) %>%
mcmc_rhat() +
ggtitle("Plot of Parameter R-hat Values")Related to this quantity is the concept of effective sample size, \(N_{eff}\), an estimate of the size of the sample from a statistical information point of view.
# Effective-sample-size ratios for all lambda and mu parameters.
pnbd_extdata_fixed_stanfit %>%
neff_ratio(pars = c("lambda", "mu")) %>%
mcmc_neff() +
ggtitle("Plot of Parameter Effective Sample Sizes")Finally, we also want to look at autocorrelation in the chains for each parameter.
# Autocorrelation of the draws for the selected parameters.
pnbd_extdata_fixed_stanfit$draws() %>%
mcmc_acf(pars = parameter_subset) +
ggtitle("Autocorrelation Plot of Sample Values")2.3 Validate the Fixed Prior Model
# Tidy the per-customer posterior draws of lambda, mu, and p_alive into a
# long table, one row per (customer, draw).
pnbd_extdata_fixed_validation_tbl <- pnbd_extdata_fixed_stanfit %>%
  recover_types(btyd_fitdata_tbl) %>%
  spread_draws(lambda[customer_id], mu[customer_id], p_alive[customer_id]) %>%
  ungroup() %>%
  rename(draw_id = .draw, post_lambda = lambda, post_mu = mu) %>%
  select(customer_id, draw_id, post_lambda, post_mu, p_alive)
pnbd_extdata_fixed_validation_tbl %>% glimpse()## Rows: 943,200
## Columns: 5
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12346", "12346", "12346", "12346", "12346", "123…
## $ draw_id [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ post_lambda [3m[38;5;246m<dbl>[39m[23m 0.231628, 0.177778, 0.194681, 0.206859, 0.205453, 0.184104…
## $ post_mu [3m[38;5;246m<dbl>[39m[23m 0.0250698, 0.0426753, 0.0444717, 0.0392961, 0.0384251, 0.0…
## $ p_alive [3m[38;5;246m<dbl>[39m[23m 0.434611, 0.368987, 0.329592, 0.346052, 0.354873, 0.396065…
Having constructed our simulation inputs, we now generate our simulations.
# Index already-precomputed simulation files so finished customers can be
# skipped, then build the per-customer lookup of posterior draws plus the
# target file each simulation should be written to.
precompute_dir <- "precompute/pnbd_extdata_fixed"

precomputed_tbl <- precompute_dir %>%
  dir_ls() %>%
  enframe(name = NULL, value = "sim_file") %>%
  mutate(sim_file = as.character(sim_file))

pnbd_extdata_fixed_validsims_lookup_tbl <- pnbd_extdata_fixed_validation_tbl %>%
  group_nest(customer_id, .key = "cust_params") %>%
  mutate(
    sim_file = glue("{precompute_dir}/sims_pnbd_extdata_fixed_{customer_id}.rds")
  )
# Run the posterior-predictive simulations only for customers whose output
# file does not already exist in the precompute directory.
exec_tbl <- pnbd_extdata_fixed_validsims_lookup_tbl %>%
anti_join(precomputed_tbl, by = "sim_file")
# Each parallel worker simulates transactions over the hold-out window
# (2011-04-01 to 2011-12-10) from one customer's posterior draws and writes
# the result to that customer's sim_file; the helper functions are shipped
# to the workers via `globals`. The whole step is skipped when everything
# is already precomputed.
if(exec_tbl %>% nrow() > 0) {
exec_tbl %>%
mutate(
calc_file = future_map2_lgl(
sim_file, cust_params,
run_pnbd_simulations_chunk,
start_dttm = as.POSIXct("2011-04-01"),
end_dttm = as.POSIXct("2011-12-10"),
.options = furrr_options(
globals = c(
"calculate_event_times", "rgamma_mucv", "gamma_mucv2shaperate",
"generate_pnbd_validation_transactions"
),
packages = c("tidyverse", "fs"),
scheduling = FALSE,
seed = 4202
),
.progress = TRUE
)
)
}
exec_tbl %>% glimpse()## Rows: 0
## Columns: 3
## $ customer_id [3m[38;5;246m<chr>[39m[23m
## $ cust_params [3m[38;5;246m<list<tibble[,4]>>[39m[23m list()
## $ sim_file [3m[38;5;246m<glue>[39m[23m
pnbd_extdata_fixed_validsims_lookup_tbl %>% glimpse()## Rows: 4,716
## Columns: 3
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12347", "12348", "12349", "12350", "12351", "123…
## $ cust_params [3m[38;5;246m<list<tibble[,4]>>[39m[23m [<tbl_df[200 x 4]>], [<tbl_df[200 x 4]>], [<t…
## $ sim_file [3m[38;5;246m<glue>[39m[23m "precompute/pnbd_extdata_fixed/sims_pnbd_extdata_fixed_12…
We now load all the simulation files into a single table.
# Read each customer's simulation file and stack the draws into one table,
# one row per (customer, posterior draw).
read_sim_file <- function(path) {
  read_rds(path) %>%
    select(draw_id, sim_tnx_count, sim_tnx_last)
}

pnbd_extdata_fixed_validsims_tbl <- pnbd_extdata_fixed_validsims_lookup_tbl %>%
  mutate(data = map(sim_file, read_sim_file)) %>%
  select(customer_id, sim_file, data) %>%
  unnest(data)
pnbd_extdata_fixed_validsims_tbl %>% glimpse()## Rows: 943,200
## Columns: 5
## $ customer_id <chr> "12346", "12346", "12346", "12346", "12346", "12346", "1…
## $ sim_file <glue> "precompute/pnbd_extdata_fixed/sims_pnbd_extdata_fixed_…
## $ draw_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ sim_tnx_count <int> 0, 2, 0, 0, 0, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2, 1,…
## $ sim_tnx_last <dttm> NA, 2011-04-17 13:22:25, NA, NA, NA, 2011-09-02 07:12:2…
# Observed hold-out statistics restricted to the customers we simulated,
# plus the two scalar totals the histograms are compared against.
tnx_data_tbl <- semi_join(
  btyd_obs_stats_tbl,
  pnbd_extdata_fixed_validsims_tbl,
  by = "customer_id"
)

obs_customer_count <- nrow(tnx_data_tbl)
obs_total_tnx_count <- sum(tnx_data_tbl$tnx_count)
# Per-draw summary across customers: how many customers transacted at all,
# and the total simulated transaction count.
pnbd_extdata_fixed_tnx_simsumm_tbl <- pnbd_extdata_fixed_validsims_tbl %>%
  group_by(draw_id) %>%
  summarise(
    .groups = "drop",
    # idiomatic count of positive entries (was length(x[x > 0]))
    sim_customer_count = sum(sim_tnx_count > 0),
    sim_total_tnx_count = sum(sim_tnx_count)
  )
# Posterior-predictive check: distribution of the simulated count of
# transacting customers, with the observed count overlaid in red.
ggplot(pnbd_extdata_fixed_tnx_simsumm_tbl) +
geom_histogram(aes(x = sim_customer_count), binwidth = 10) +
geom_vline(aes(xintercept = obs_customer_count), colour = "red") +
labs(
x = "Simulated Customers With Transactions",
y = "Frequency",
title = "Histogram of Count of Customers Transacted",
subtitle = "Observed Count in Red"
)ggplot(pnbd_extdata_fixed_tnx_simsumm_tbl) +
geom_histogram(aes(x = sim_total_tnx_count), binwidth = 50) +
geom_vline(aes(xintercept = obs_total_tnx_count), colour = "red") +
labs(
x = "Simulated Transaction Count",
y = "Frequency",
title = "Histogram of Count of Total Transaction Count",
subtitle = "Observed Count in Red"
)2.4 Write to Disk
# Persist the stacked simulations for use in later workbooks.
pnbd_extdata_fixed_validsims_tbl %>% write_rds("data/pnbd_extdata_fixed_validsims_tbl.rds")3 Fit Alternative Prior Model
3.1 Fit the Model
We then use this compiled model with our data to produce a fit of the data.
# Assemble the Stan data list for the alternative (wider) priors: same CBS
# fields, but a higher lambda mean and looser coefficients of variation.
stan_modelname <- "pnbd_extdata_fixed2"
stanfit_prefix <- str_c("fit_", stan_modelname)

stan_data_lst <- btyd_fitdata_tbl %>%
  select(customer_id, x, t_x, T_cal) %>%
  compose_data(
    lambda_mn = 0.50,
    lambda_cv = 1.00,
    mu_mn = 0.05,
    mu_cv = 1.00  # trailing comma removed: it creates an empty argument in R
  )
# Sample the alternative-prior model with the same compiled executable:
# 4 chains, 200 warmup + 50 sampling iterations each.
pnbd_extdata_fixed2_stanfit <- pnbd_extdata_fixed_stanmodel$sample(
  data = stan_data_lst,
  chains = 4,
  iter_warmup = 200,
  iter_sampling = 50,
  seed = 4202,
  save_warmup = TRUE,
  output_dir = stan_modeldir,
  output_basename = stanfit_prefix  # trailing comma removed (empty argument)
)## Running MCMC with 4 parallel chains...
##
## Chain 1 Iteration: 1 / 250 [ 0%] (Warmup)
## Chain 2 Iteration: 1 / 250 [ 0%] (Warmup)
## Chain 3 Iteration: 1 / 250 [ 0%] (Warmup)
## Chain 4 Iteration: 1 / 250 [ 0%] (Warmup)
## Chain 2 Iteration: 100 / 250 [ 40%] (Warmup)
## Chain 1 Iteration: 100 / 250 [ 40%] (Warmup)
## Chain 3 Iteration: 100 / 250 [ 40%] (Warmup)
## Chain 4 Iteration: 100 / 250 [ 40%] (Warmup)
## Chain 2 Iteration: 200 / 250 [ 80%] (Warmup)
## Chain 2 Iteration: 201 / 250 [ 80%] (Sampling)
## Chain 3 Iteration: 200 / 250 [ 80%] (Warmup)
## Chain 3 Iteration: 201 / 250 [ 80%] (Sampling)
## Chain 1 Iteration: 200 / 250 [ 80%] (Warmup)
## Chain 1 Iteration: 201 / 250 [ 80%] (Sampling)
## Chain 4 Iteration: 200 / 250 [ 80%] (Warmup)
## Chain 4 Iteration: 201 / 250 [ 80%] (Sampling)
## Chain 2 Iteration: 250 / 250 [100%] (Sampling)
## Chain 2 finished in 64.5 seconds.
## Chain 4 Iteration: 250 / 250 [100%] (Sampling)
## Chain 4 finished in 69.1 seconds.
## Chain 3 Iteration: 250 / 250 [100%] (Sampling)
## Chain 1 Iteration: 250 / 250 [100%] (Sampling)
## Chain 3 finished in 72.6 seconds.
## Chain 1 finished in 73.0 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 69.8 seconds.
## Total execution time: 73.1 seconds.
pnbd_extdata_fixed2_stanfit$summary()## # A tibble: 14,149 × 10
## variable mean median sd mad q5 q95 rhat ess_bulk
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 lp__ -8.63e+4 -8.63e+4 78.6 76.6 -8.65e+4 -8.62e+4 1.16 21.2
## 2 lambda[1] 1.84e-1 1.77e-1 0.0539 0.0572 1.05e-1 2.64e-1 1.00 286.
## 3 lambda[2] 1.49e-1 1.21e-1 0.103 0.0760 2.90e-2 3.41e-1 0.997 249.
## 4 lambda[3] 1.12e-1 9.09e-2 0.0826 0.0591 2.76e-2 2.74e-1 1.00 248.
## 5 lambda[4] 7.52e-2 5.97e-2 0.0529 0.0432 1.73e-2 1.73e-1 1.01 210.
## 6 lambda[5] 1.81e-1 1.02e-1 0.232 0.117 7.37e-3 5.89e-1 0.999 325.
## 7 lambda[6] 2.32e-1 8.23e-2 0.371 0.0979 6.11e-3 9.36e-1 0.999 310.
## 8 lambda[7] 3.24e-1 3.15e-1 0.120 0.128 1.44e-1 5.38e-1 1.01 436.
## 9 lambda[8] 1.62e-1 8.32e-2 0.231 0.0959 2.30e-3 6.60e-1 1.02 150.
## 10 lambda[9] 1.99e-1 1.10e-1 0.276 0.131 7.06e-3 6.02e-1 1.01 240.
## # … with 14,139 more rows, and 1 more variable: ess_tail <dbl>
We have some basic HMC-based validity statistics we can check.
pnbd_extdata_fixed2_stanfit$cmdstan_diagnose()## Processing csv files: /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed2-1.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed2-2.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed2-3.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed2-4.csvWarning: non-fatal error reading adaptation data
##
##
## Checking sampler transitions treedepth.
## Treedepth satisfactory for all transitions.
##
## Checking sampler transitions for divergences.
## No divergent transitions found.
##
## Checking E-BFMI - sampler transitions HMC potential energy.
## E-BFMI satisfactory.
##
## Effective sample size satisfactory.
##
## The following parameters had split R-hat greater than 1.05:
## lambda[1890], lambda[3387], lambda[3614], lambda[3865], mu[1145], mu[2117], mu[2243], mu[2560], mu[2697], mu[4046], p_alive[19], p_alive[79], p_alive[193], p_alive[227], p_alive[248], p_alive[303], p_alive[330], p_alive[375], p_alive[383], p_alive[438], p_alive[444], p_alive[474], p_alive[484], p_alive[494], p_alive[501], p_alive[503], p_alive[526], p_alive[536], p_alive[554], p_alive[586], p_alive[595], p_alive[597], p_alive[633], p_alive[641], p_alive[642], p_alive[649], p_alive[650], p_alive[661], p_alive[717], p_alive[718], p_alive[727], p_alive[747], p_alive[748], p_alive[759], p_alive[782], p_alive[788], p_alive[792], p_alive[794], p_alive[825], p_alive[847], p_alive[888], p_alive[911], p_alive[920], p_alive[940], p_alive[946], p_alive[957], p_alive[997], p_alive[1017], p_alive[1029], p_alive[1033], p_alive[1039], p_alive[1042], p_alive[1050], p_alive[1096], p_alive[1104], p_alive[1117], p_alive[1145], p_alive[1146], p_alive[1148], p_alive[1163], p_alive[1202], p_alive[1207], p_alive[1209], p_alive[1226], p_alive[1237], p_alive[1256], p_alive[1343], p_alive[1374], p_alive[1397], p_alive[1427], p_alive[1485], p_alive[1490], p_alive[1495], p_alive[1496], p_alive[1518], p_alive[1525], p_alive[1532], p_alive[1555], p_alive[1559], p_alive[1566], p_alive[1614], p_alive[1627], p_alive[1654], p_alive[1676], p_alive[1681], p_alive[1695], p_alive[1733], p_alive[1742], p_alive[1756], p_alive[1810], p_alive[1813], p_alive[1841], p_alive[1858], p_alive[1890], p_alive[1912], p_alive[1913], p_alive[1922], p_alive[1923], p_alive[1959], p_alive[1993], p_alive[2000], p_alive[2071], p_alive[2073], p_alive[2085], p_alive[2089], p_alive[2111], p_alive[2117], p_alive[2125], p_alive[2155], p_alive[2163], p_alive[2230], p_alive[2234], p_alive[2243], p_alive[2268], p_alive[2271], p_alive[2289], p_alive[2296], p_alive[2301], p_alive[2309], p_alive[2312], p_alive[2316], p_alive[2324], p_alive[2349], p_alive[2368], p_alive[2395], p_alive[2418], p_alive[2422], p_alive[2431], 
p_alive[2460], p_alive[2470], p_alive[2488], p_alive[2525], p_alive[2560], p_alive[2561], p_alive[2582], p_alive[2586], p_alive[2607], p_alive[2644], p_alive[2646], p_alive[2648], p_alive[2661], p_alive[2665], p_alive[2668], p_alive[2681], p_alive[2697], p_alive[2698], p_alive[2759], p_alive[2778], p_alive[2788], p_alive[2818], p_alive[2861], p_alive[2865], p_alive[2892], p_alive[2968], p_alive[2980], p_alive[2988], p_alive[3003], p_alive[3023], p_alive[3036], p_alive[3042], p_alive[3068], p_alive[3114], p_alive[3127], p_alive[3143], p_alive[3144], p_alive[3157], p_alive[3160], p_alive[3163], p_alive[3183], p_alive[3221], p_alive[3245], p_alive[3274], p_alive[3292], p_alive[3316], p_alive[3350], p_alive[3378], p_alive[3391], p_alive[3398], p_alive[3420], p_alive[3425], p_alive[3426], p_alive[3433], p_alive[3448], p_alive[3494], p_alive[3515], p_alive[3545], p_alive[3548], p_alive[3583], p_alive[3605], p_alive[3692], p_alive[3698], p_alive[3743], p_alive[3766], p_alive[3773], p_alive[3881], p_alive[3920], p_alive[3971], p_alive[3978], p_alive[4013], p_alive[4020], p_alive[4046], p_alive[4067], p_alive[4070], p_alive[4101], p_alive[4112], p_alive[4142], p_alive[4160], p_alive[4163], p_alive[4181], p_alive[4190], p_alive[4210], p_alive[4216], p_alive[4225], p_alive[4279], p_alive[4288], p_alive[4307], p_alive[4399], p_alive[4437], p_alive[4439], p_alive[4453], p_alive[4469], p_alive[4486], p_alive[4488], p_alive[4508], p_alive[4514], p_alive[4528], p_alive[4540], p_alive[4557], p_alive[4581], p_alive[4626], p_alive[4627], p_alive[4645], p_alive[4648], p_alive[4690], p_alive[4699], p_alive[4709], p_alive[4715]
## Such high values indicate incomplete mixing and biased estimation.
## You should consider regularizating your model with additional prior information or a more effective parameterization.
##
## Processing complete.
3.2 Visual Diagnostics of the Sample Validity
Now that we have a sample from the posterior distribution we need to create a few different visualisations of the diagnostics.
# Same small parameter subset as before, rebuilt for the alternative fit.
parameter_subset <- c(
  sprintf("lambda[%d]", 1:4),
  sprintf("mu[%d]", 1:4)
)
# Trace plot of the post-warmup draws for the alternative-prior fit.
pnbd_extdata_fixed2_stanfit$draws(inc_warmup = FALSE) %>%
mcmc_trace(pars = parameter_subset) +
expand_limits(y = 0) +
labs(
x = "Iteration",
y = "Value",
title = "Traceplot of Sample of Lambda and Mu Values"
) +
theme(axis.text.x = element_text(size = 10))A common MCMC diagnostic is \(\hat{R}\) - which is a measure of the ‘similarity’ of the chains.
# R-hat convergence diagnostic for the alternative-prior fit.
pnbd_extdata_fixed2_stanfit %>%
rhat(pars = c("lambda", "mu")) %>%
mcmc_rhat() +
ggtitle("Plot of Parameter R-hat Values")Related to this quantity is the concept of effective sample size, \(N_{eff}\), an estimate of the size of the sample from a statistical information point of view.
# Effective-sample-size ratios for the alternative-prior fit.
pnbd_extdata_fixed2_stanfit %>%
neff_ratio(pars = c("lambda", "mu")) %>%
mcmc_neff() +
ggtitle("Plot of Parameter Effective Sample Sizes")Finally, we also want to look at autocorrelation in the chains for each parameter.
# Autocorrelation of the draws for the selected parameters.
pnbd_extdata_fixed2_stanfit$draws() %>%
mcmc_acf(pars = parameter_subset) +
ggtitle("Autocorrelation Plot of Sample Values")3.3 Validate the Alternate Prior Model
# Tidy the alternative-prior posterior draws of lambda, mu, and p_alive into
# a long table, one row per (customer, draw).
pnbd_extdata_fixed2_validation_tbl <- pnbd_extdata_fixed2_stanfit %>%
  recover_types(btyd_fitdata_tbl) %>%
  spread_draws(lambda[customer_id], mu[customer_id], p_alive[customer_id]) %>%
  ungroup() %>%
  rename(draw_id = .draw, post_lambda = lambda, post_mu = mu) %>%
  select(customer_id, draw_id, post_lambda, post_mu, p_alive)
pnbd_extdata_fixed2_validation_tbl %>% glimpse()## Rows: 943,200
## Columns: 5
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12346", "12346", "12346", "12346", "12346", "123…
## $ draw_id [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ post_lambda [3m[38;5;246m<dbl>[39m[23m 0.135532, 0.221596, 0.146430, 0.136745, 0.220450, 0.104887…
## $ post_mu [3m[38;5;246m<dbl>[39m[23m 0.008837510, 0.008832730, 0.002376910, 0.003333760, 0.0015…
## $ p_alive [3m[38;5;246m<dbl>[39m[23m 0.824880, 0.724790, 0.944504, 0.927721, 0.939766, 0.733008…
Having constructed our simulation inputs, we now generate our simulations.
# Index already-precomputed simulation files for the alternative-prior fit,
# then build the per-customer lookup of posterior draws plus target file.
precompute_dir <- "precompute/pnbd_extdata_fixed2"

precomputed_tbl <- precompute_dir %>%
  dir_ls() %>%
  enframe(name = NULL, value = "sim_file") %>%
  mutate(sim_file = as.character(sim_file))

pnbd_extdata_fixed2_validsims_lookup_tbl <- pnbd_extdata_fixed2_validation_tbl %>%
  group_nest(customer_id, .key = "cust_params") %>%
  mutate(
    sim_file = glue("{precompute_dir}/sims_pnbd_extdata_fixed2_smalliter_{customer_id}.rds")
  )
# Run the posterior-predictive simulations only for customers whose output
# file does not already exist in the precompute directory.
exec_tbl <- pnbd_extdata_fixed2_validsims_lookup_tbl %>%
anti_join(precomputed_tbl, by = "sim_file")
# Same parallel simulation step as for the fixed-prior model: each worker
# simulates the hold-out window (2011-04-01 to 2011-12-10) from one
# customer's posterior draws and writes the result to sim_file.
if(exec_tbl %>% nrow() > 0) {
exec_tbl %>%
mutate(
calc_file = future_map2_lgl(
sim_file, cust_params,
run_pnbd_simulations_chunk,
start_dttm = as.POSIXct("2011-04-01"),
end_dttm = as.POSIXct("2011-12-10"),
.options = furrr_options(
globals = c(
"calculate_event_times", "rgamma_mucv", "gamma_mucv2shaperate",
"generate_pnbd_validation_transactions"
),
packages = c("tidyverse", "fs"),
scheduling = FALSE,
seed = 4202
),
.progress = TRUE
)
)
}
exec_tbl %>% glimpse()## Rows: 0
## Columns: 3
## $ customer_id [3m[38;5;246m<chr>[39m[23m
## $ cust_params [3m[38;5;246m<list<tibble[,4]>>[39m[23m list()
## $ sim_file [3m[38;5;246m<glue>[39m[23m
pnbd_extdata_fixed2_validsims_lookup_tbl %>% glimpse()## Rows: 4,716
## Columns: 3
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12347", "12348", "12349", "12350", "12351", "123…
## $ cust_params [3m[38;5;246m<list<tibble[,4]>>[39m[23m [<tbl_df[200 x 4]>], [<tbl_df[200 x 4]>], [<t…
## $ sim_file [3m[38;5;246m<glue>[39m[23m "precompute/pnbd_extdata_fixed2/sims_pnbd_extdata_fixed2_…
We now load all the simulation files into a single table.
# Read each customer's simulation file and stack the draws into one table.
# Note this variant drops sim_file from the output columns.
pnbd_extdata_fixed2_validsims_tbl <- pnbd_extdata_fixed2_validsims_lookup_tbl %>%
  mutate(
    data = map(
      sim_file,
      function(f) read_rds(f) %>% select(draw_id, sim_tnx_count, sim_tnx_last)
    )
  ) %>%
  select(customer_id, data) %>%
  unnest(data)
pnbd_extdata_fixed2_validsims_tbl %>% glimpse()## Rows: 943,200
## Columns: 4
## $ customer_id <chr> "12346", "12346", "12346", "12346", "12346", "12346", "1…
## $ draw_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ sim_tnx_count <int> 4, 4, 5, 2, 9, 2, 0, 3, 4, 4, 2, 8, 0, 5, 0, 0, 3, 6, 1,…
## $ sim_tnx_last <dttm> 2011-10-20 14:05:55, 2011-10-24 23:36:24, 2011-11-27 20…
# Observed hold-out statistics restricted to the customers we simulated,
# plus the two scalar totals the histograms are compared against.
tnx_data_tbl <- semi_join(
  btyd_obs_stats_tbl,
  pnbd_extdata_fixed2_validsims_tbl,
  by = "customer_id"
)

obs_customer_count <- nrow(tnx_data_tbl)
obs_total_tnx_count <- sum(tnx_data_tbl$tnx_count)
# Per-draw summary across customers: how many customers transacted at all,
# and the total simulated transaction count.
pnbd_extdata_fixed2_tnx_simsumm_tbl <- pnbd_extdata_fixed2_validsims_tbl %>%
  group_by(draw_id) %>%
  summarise(
    .groups = "drop",
    # idiomatic count of positive entries (was length(x[x > 0]))
    sim_customer_count = sum(sim_tnx_count > 0),
    sim_total_tnx_count = sum(sim_tnx_count)
  )
# Posterior-predictive check for the alternative-prior fit: simulated counts
# of transacting customers and total transactions, observed values in red.
ggplot(pnbd_extdata_fixed2_tnx_simsumm_tbl) +
geom_histogram(aes(x = sim_customer_count), binwidth = 10) +
geom_vline(aes(xintercept = obs_customer_count), colour = "red") +
labs(
x = "Simulated Customers With Transactions",
y = "Frequency",
title = "Histogram of Count of Customers Transacted",
subtitle = "Observed Count in Red"
)ggplot(pnbd_extdata_fixed2_tnx_simsumm_tbl) +
geom_histogram(aes(x = sim_total_tnx_count), binwidth = 50) +
geom_vline(aes(xintercept = obs_total_tnx_count), colour = "red") +
labs(
x = "Simulated Transaction Count",
y = "Frequency",
title = "Histogram of Count of Total Transaction Count",
subtitle = "Observed Count in Red"
)3.4 Refit with Higher Iteration Samples
We now want to refit this model with a higher iteration count.
# Rebuild the same alternative-prior Stan data list for the higher-iteration
# refit (the hyperparameters are unchanged from the previous fit).
stan_modelname <- "pnbd_extdata_fixed2"
stanfit_prefix <- str_c("fit_", stan_modelname)

stan_data_lst <- btyd_fitdata_tbl %>%
  select(customer_id, x, t_x, T_cal) %>%
  compose_data(
    lambda_mn = 0.50,
    lambda_cv = 1.00,
    mu_mn = 0.05,
    mu_cv = 1.00  # trailing comma removed: it creates an empty argument in R
  )
# Refit with more iterations (500 warmup + 500 sampling per chain) and a new
# seed; this overwrites the earlier short pnbd_extdata_fixed2_stanfit.
pnbd_extdata_fixed2_stanfit <- pnbd_extdata_fixed_stanmodel$sample(
  data = stan_data_lst,
  chains = 4,
  iter_warmup = 500,
  iter_sampling = 500,
  seed = 4203,
  save_warmup = TRUE,
  output_dir = stan_modeldir,
  output_basename = stanfit_prefix  # trailing comma removed (empty argument)
)## Running MCMC with 4 parallel chains...
##
## Chain 1 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 2 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 3 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 4 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 4 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 2 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 3 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 1 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 4 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 3 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 2 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 1 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 4 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 2 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 3 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 1 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 4 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 3 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 2 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 1 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 4 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 4 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 3 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 3 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 2 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 2 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 1 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 1 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 4 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 3 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 2 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 1 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 4 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 3 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 2 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 1 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 4 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 3 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 2 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 1 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 4 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 3 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 2 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 1 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 4 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 4 finished in 181.5 seconds.
## Chain 3 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 3 finished in 182.1 seconds.
## Chain 2 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 2 finished in 183.8 seconds.
## Chain 1 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 1 finished in 185.5 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 183.2 seconds.
## Total execution time: 185.8 seconds.
pnbd_extdata_fixed2_stanfit$summary()## # A tibble: 14,149 × 10
## variable mean median sd mad q5 q95 rhat ess_bulk
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 lp__ -8.63e+4 -8.63e+4 76.7 76.5 -8.65e+4 -8.62e+4 1.00 692.
## 2 lambda[1] 1.77e-1 1.73e-1 0.0494 0.0489 1.03e-1 2.64e-1 1.00 1907.
## 3 lambda[2] 1.44e-1 1.24e-1 0.0905 0.0772 3.61e-2 3.20e-1 1.00 1659.
## 4 lambda[3] 1.11e-1 9.89e-2 0.0638 0.0594 3.05e-2 2.30e-1 0.999 1716.
## 5 lambda[4] 7.62e-2 6.44e-2 0.0484 0.0384 2.01e-2 1.69e-1 1.00 2011.
## 6 lambda[5] 1.78e-1 9.78e-2 0.234 0.108 6.28e-3 6.32e-1 1.00 1882.
## 7 lambda[6] 1.96e-1 9.55e-2 0.277 0.109 7.05e-3 7.43e-1 1.00 1571.
## 8 lambda[7] 3.17e-1 3.02e-1 0.118 0.114 1.52e-1 5.39e-1 1.00 2202.
## 9 lambda[8] 1.98e-1 9.89e-2 0.272 0.118 5.98e-3 7.09e-1 1.00 1613.
## 10 lambda[9] 2.02e-1 9.45e-2 0.276 0.117 5.14e-3 7.74e-1 1.00 2264.
## # … with 14,139 more rows, and 1 more variable: ess_tail <dbl>
We have some basic HMC-based validity statistics we can check.
pnbd_extdata_fixed2_stanfit$cmdstan_diagnose()## Processing csv files: /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed2-1.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed2-2.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed2-3.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_fixed2-4.csvWarning: non-fatal error reading adaptation data
##
##
## Checking sampler transitions treedepth.
## Treedepth satisfactory for all transitions.
##
## Checking sampler transitions for divergences.
## No divergent transitions found.
##
## Checking E-BFMI - sampler transitions HMC potential energy.
## E-BFMI satisfactory.
##
## Effective sample size satisfactory.
##
## Split R-hat values satisfactory all parameters.
##
## Processing complete, no problems detected.
3.5 Visual Diagnostics of the Higher Iteration Sample
Now that we have a sample from the posterior distribution we need to create a few different visualisations of the diagnostics.
# Subset of per-customer parameter names to show in the diagnostic plots:
# the first four lambda entries and the first four mu entries.
parameter_subset <- c(
sprintf("lambda[%d]", 1:4),
sprintf("mu[%d]", 1:4)
)
# Traceplot of the selected lambda and mu draws (warmup excluded) to
# visually assess chain mixing.
pnbd_extdata_fixed2_stanfit$draws(inc_warmup = FALSE) %>%
mcmc_trace(pars = parameter_subset) +
expand_limits(y = 0) +
labs(
x = "Iteration",
y = "Value",
title = "Traceplot of Sample of Lambda and Mu Values"
) +
theme(axis.text.x = element_text(size = 10))A common MCMC diagnostic is \(\hat{R}\) - which is a measure of the ‘similarity’ of the chains.
# R-hat for every lambda and mu parameter; values near 1 indicate the
# chains agree with each other.
pnbd_extdata_fixed2_stanfit %>%
rhat(pars = c("lambda", "mu")) %>%
mcmc_rhat() +
ggtitle("Plot of Parameter R-hat Values")Related to this quantity is the concept of effective sample size, \(N_{eff}\), an estimate of the size of the sample from a statistical information point of view.
# Effective-sample-size ratios (N_eff / N) for the lambda and mu parameters.
pnbd_extdata_fixed2_stanfit %>%
neff_ratio(pars = c("lambda", "mu")) %>%
mcmc_neff() +
ggtitle("Plot of Parameter Effective Sample Sizes")Finally, we also want to look at autocorrelation in the chains for each parameter.
# Autocorrelation of the chains for the selected parameter subset.
pnbd_extdata_fixed2_stanfit$draws() %>%
mcmc_acf(pars = parameter_subset) +
ggtitle("Autocorrelation Plot of Sample Values")3.6 Validate the Higher Iteration Model
We now want to revalidate this model, so we repeat our steps.
# Extract per-customer posterior draws of lambda, mu and p_alive into a
# long tibble - one row per (customer, draw) - for the simulation step.
pnbd_extdata_fixed2_validation_tbl <- pnbd_extdata_fixed2_stanfit %>%
recover_types(btyd_fitdata_tbl) %>%
spread_draws(lambda[customer_id], mu[customer_id], p_alive[customer_id]) %>%
ungroup() %>%
select(
customer_id, draw_id = .draw, post_lambda = lambda, post_mu = mu, p_alive
)
pnbd_extdata_fixed2_validation_tbl %>% glimpse()## Rows: 9,432,000
## Columns: 5
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12346", "12346", "12346", "12346", "12346", "123…
## $ draw_id [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ post_lambda [3m[38;5;246m<dbl>[39m[23m 0.177912, 0.192083, 0.189967, 0.146978, 0.136297, 0.232329…
## $ post_mu [3m[38;5;246m<dbl>[39m[23m 0.030414800, 0.001790260, 0.042978100, 0.014361600, 0.0156…
## $ p_alive [3m[38;5;246m<dbl>[39m[23m 0.471672, 0.943662, 0.347124, 0.721901, 0.716274, 0.736041…
Having constructed our simulation inputs, we now generate our simulations.
# Simulations are cached on disk: list the already-computed files, build
# the expected file name for each customer, and then isolate the customers
# whose simulation file does not exist yet.
precompute_dir <- "precompute/pnbd_extdata_fixed2"
precomputed_tbl <- dir_ls(precompute_dir) %>%
enframe(name = NULL, value = "sim_file") %>%
mutate(sim_file = sim_file %>% as.character())
pnbd_extdata_fixed2_validsims_lookup_tbl <- pnbd_extdata_fixed2_validation_tbl %>%
group_nest(customer_id, .key = "cust_params") %>%
mutate(
sim_file = glue(
"{precompute_dir}/sims_pnbd_extdata_fixed2_highiter_{customer_id}.rds"
)
)
# Customers still requiring simulation (no cached file present).
exec_tbl <- pnbd_extdata_fixed2_validsims_lookup_tbl %>%
anti_join(precomputed_tbl, by = "sim_file")
# Run the transaction simulations in parallel for any customers without a
# cached file. The mutate() here is evaluated for its side effect: each
# call to run_pnbd_simulations_chunk writes an .rds file to disk and
# returns a logical, so the resulting tibble is deliberately discarded.
if(exec_tbl %>% nrow() > 0) {
exec_tbl %>%
mutate(
calc_file = future_map2_lgl(
sim_file, cust_params,
run_pnbd_simulations_chunk,
# Validation window over which transactions are simulated.
start_dttm = as.POSIXct("2011-04-01"),
end_dttm = as.POSIXct("2011-12-10"),
.options = furrr_options(
# Helper functions that must be exported to the workers.
globals = c(
"calculate_event_times", "rgamma_mucv", "gamma_mucv2shaperate",
"generate_pnbd_validation_transactions"
),
packages = c("tidyverse", "fs"),
scheduling = FALSE,
seed = 4202
),
.progress = TRUE
)
)
}
exec_tbl %>% glimpse()## Rows: 0
## Columns: 3
## $ customer_id [3m[38;5;246m<chr>[39m[23m
## $ cust_params [3m[38;5;246m<list<tibble[,4]>>[39m[23m list()
## $ sim_file [3m[38;5;246m<glue>[39m[23m
pnbd_extdata_fixed2_validsims_lookup_tbl %>% glimpse()## Rows: 4,716
## Columns: 3
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12347", "12348", "12349", "12350", "12351", "123…
## $ cust_params [3m[38;5;246m<list<tibble[,4]>>[39m[23m [<tbl_df[2000 x 4]>], [<tbl_df[2000 x 4]>], […
## $ sim_file [3m[38;5;246m<glue>[39m[23m "precompute/pnbd_extdata_fixed2/sims_pnbd_extdata_fixed2_…
We now load all the simulations from their individual files into a single table.
# Read each customer's cached simulation file and combine them into a
# single tibble of (customer_id, draw_id, sim_tnx_count, sim_tnx_last).
pnbd_extdata_fixed2_validsims_tbl <- pnbd_extdata_fixed2_validsims_lookup_tbl %>%
mutate(
data = map(sim_file, ~ .x %>% read_rds() %>% select(draw_id, sim_tnx_count, sim_tnx_last))
) %>%
select(customer_id, data) %>%
unnest(data)
pnbd_extdata_fixed2_validsims_tbl %>% glimpse()## Rows: 9,432,000
## Columns: 4
## $ customer_id <chr> "12346", "12346", "12346", "12346", "12346", "12346", "1…
## $ draw_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ sim_tnx_count <int> 0, 7, 0, 6, 2, 5, 0, 6, 0, 8, 1, 8, 0, 8, 2, 0, 0, 6, 0,…
## $ sim_tnx_last <dttm> NA, 2011-12-03 21:14:33, NA, 2011-12-01 03:19:22, 2011-…
# Restrict the observed statistics to the customers present in the
# simulations, then recompute the observed benchmarks and the per-draw
# simulated totals for the posterior-predictive check.
tnx_data_tbl <- btyd_obs_stats_tbl %>%
semi_join(pnbd_extdata_fixed2_validsims_tbl, by = "customer_id")
obs_customer_count <- tnx_data_tbl %>% nrow()
obs_total_tnx_count <- tnx_data_tbl %>% pull(tnx_count) %>% sum()
pnbd_extdata_fixed2_tnx_simsumm_tbl <- pnbd_extdata_fixed2_validsims_tbl %>%
group_by(draw_id) %>%
summarise(
.groups = "drop",
sim_customer_count = length(sim_tnx_count[sim_tnx_count > 0]),
sim_total_tnx_count = sum(sim_tnx_count)
)
# Posterior-predictive check for the higher-iteration fit: histogram of the
# per-draw simulated count of transacting customers, observed count in red.
ggplot(pnbd_extdata_fixed2_tnx_simsumm_tbl) +
geom_histogram(aes(x = sim_customer_count), binwidth = 10) +
geom_vline(aes(xintercept = obs_customer_count), colour = "red") +
labs(
x = "Simulated Customers With Transactions",
y = "Frequency",
title = "Histogram of Count of Customers Transacted",
subtitle = "Observed Count in Red"
)ggplot(pnbd_extdata_fixed2_tnx_simsumm_tbl) +
# Same check for the per-draw total simulated transaction count.
geom_histogram(aes(x = sim_total_tnx_count), binwidth = 50) +
geom_vline(aes(xintercept = obs_total_tnx_count), colour = "red") +
labs(
x = "Simulated Transaction Count",
y = "Frequency",
title = "Histogram of Count of Total Transaction Count",
subtitle = "Observed Count in Red"
)3.7 Write to Disk
# Persist the combined simulation table for reuse in later documents.
pnbd_extdata_fixed2_validsims_tbl %>% write_rds("data/pnbd_extdata_fixed2_validsims_tbl.rds")4 Fit Mu Hierarchical Model
We now want to add a hierarchy to the model by adding hierarchical priors to our P/NBD model. In particular, we focus on adding a prior for \(\mu\) as we have more confidence in our estimates for \(\lambda\) and so we want to model our uncertainty in the lifetime of the customer.
## functions {
## #include util_functions.stan
## }
##
## data {
## int<lower=1> n; // number of customers
##
## vector<lower=0>[n] t_x; // time to most recent purchase
## vector<lower=0>[n] T_cal; // total observation time
## vector<lower=0>[n] x; // number of purchases observed
##
## real<lower=0> lambda_mn; // prior mean for lambda
## real<lower=0> lambda_cv; // prior cv for lambda
##
## real mu_mn_p1; // hyperprior p1 for mu mean
## real<lower=0> mu_mn_p2; // hyperprior p2 for mu mean
##
## real mu_cv_p1; // hyperprior p1 for mu cv
## real<lower=0> mu_cv_p2; // hyperprior p2 for mu cv
## }
##
## transformed data {
## real<lower=0> r = 1 / (lambda_cv * lambda_cv);
## real<lower=0> alpha = 1 / (lambda_cv * lambda_cv * lambda_mn);
## }
##
##
## parameters {
## real<lower=0> mu_mn;
## real<lower=0> mu_cv;
##
## vector<lower=0>[n] lambda; // purchase rate
## vector<lower=0>[n] mu; // lifetime dropout rate
## }
##
##
## transformed parameters {
## real<lower=0> s;
## real<lower=0> beta;
##
## s = 1 / (mu_cv * mu_cv);
## beta = 1 / (mu_cv * mu_cv * mu_mn);
## }
##
## model {
## // model the hyper-prior
## mu_mn ~ lognormal(mu_mn_p1, mu_mn_p2);
## mu_cv ~ lognormal(mu_cv_p1, mu_cv_p2);
##
## // setting priors
## lambda ~ gamma(r, alpha);
## mu ~ gamma(s, beta);
##
## target += calculate_pnbd_loglik(n, lambda, mu, x, t_x, T_cal);
## }
##
## generated quantities {
## vector[n] p_alive; // Probability that they are still "alive"
##
## p_alive = 1 ./ (1 + mu ./ (mu + lambda) .* (exp((lambda + mu) .* (T_cal - t_x)) - 1));
## }
We now compile this model using CmdStanR.
# Compile the hierarchical-mu Stan model. Pedantic mode turns on extra
# compile-time checks; the executable is written to the model directory.
pnbd_extdata_hiermu_stanmodel <- cmdstan_model(
"stan_code/pnbd_hiermu.stan",
include_paths = stan_codedir,
pedantic = TRUE,
dir = stan_modeldir
)4.1 First Fit Attempt
We then use this compiled model with our data to produce a fit of the data.
# Fit the hierarchical-mu model: lambda keeps a fixed gamma prior, while
# the mu prior mean and CV receive lognormal hyperpriors. The p1 values
# are offset by -0.5 * sdlog^2 so the lognormal *mean* (not median)
# equals the stated value (0.05 and 0.5 respectively).
stan_modelname <- "pnbd_extdata_hiermu"
stanfit_prefix <- str_c("fit_", stan_modelname)
stan_data_lst <- btyd_fitdata_tbl %>%
select(customer_id, x, t_x, T_cal) %>%
compose_data(
lambda_mn = 1.00,
lambda_cv = 0.75,
mu_mn_p1 = log(0.05) - 0.5 * (0.5)^2,
mu_mn_p2 = 0.5,
mu_cv_p1 = log(0.5) - 0.5 * (0.2)^2,
mu_cv_p2 = 0.2,
)
# adapt_delta is raised to 0.90 to take smaller HMC steps for this
# harder-to-sample hierarchical posterior.
pnbd_extdata_hiermu_stanfit <- pnbd_extdata_hiermu_stanmodel$sample(
data = stan_data_lst,
chains = 4,
iter_warmup = 500,
iter_sampling = 500,
seed = 4204,
save_warmup = TRUE,
adapt_delta = 0.90,
max_treedepth = 10,
output_dir = stan_modeldir,
output_basename = stanfit_prefix,
)## Running MCMC with 4 parallel chains...
##
## Chain 1 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 2 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 3 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 4 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 4 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 1 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 2 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 3 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 4 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 1 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 3 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 2 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 4 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 1 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 3 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 2 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 4 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 1 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 3 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 2 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 4 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 4 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 1 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 1 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 2 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 2 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 3 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 3 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 4 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 1 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 2 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 4 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 1 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 2 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 3 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 4 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 1 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 2 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 4 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 1 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 2 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 3 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 4 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 4 finished in 1391.0 seconds.
## Chain 1 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 1 finished in 1423.8 seconds.
## Chain 2 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 2 finished in 1428.8 seconds.
## Chain 3 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 3 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 3 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 3 finished in 1745.7 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 1497.3 seconds.
## Total execution time: 1746.2 seconds.
pnbd_extdata_hiermu_stanfit$summary()## # A tibble: 14,153 × 10
## variable mean median sd mad q5 q95 rhat ess_bulk
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 lp__ -8.44e+4 -8.44e+4 1.63e+2 1.68e+2 -8.46e+4 -8.41e+4 1.03 73.8
## 2 mu_mn 2.71e+1 2.64e+1 5.48e+0 5.23e+0 1.93e+1 3.71e+1 1.01 151.
## 3 mu_cv 4.06e+0 4.06e+0 7.24e-2 7.63e-2 3.94e+0 4.18e+0 1.02 86.5
## 4 lambda[1] 1.85e-1 1.81e-1 5.11e-2 5.01e-2 1.10e-1 2.72e-1 1.00 2169.
## 5 lambda[2] 1.68e-1 1.55e-1 8.78e-2 7.83e-2 5.67e-2 3.32e-1 1.00 2489.
## 6 lambda[3] 1.35e-1 1.24e-1 6.81e-2 6.37e-2 4.46e-2 2.58e-1 1.00 2379.
## 7 lambda[4] 7.96e-2 7.13e-2 4.47e-2 3.84e-2 2.32e-2 1.61e-1 1.00 2159.
## 8 lambda[5] 8.54e-1 6.98e-1 6.74e-1 6.08e-1 1.01e-1 2.15e+0 1.01 298.
## 9 lambda[6] 9.16e-1 7.49e-1 6.88e-1 5.89e-1 1.29e-1 2.21e+0 1.00 1830.
## 10 lambda[7] 3.56e-1 3.42e-1 1.26e-1 1.26e-1 1.83e-1 5.81e-1 1.00 2608.
## # … with 14,143 more rows, and 1 more variable: ess_tail <dbl>
We have some basic HMC-based validity statistics we can check.
pnbd_extdata_hiermu_stanfit$cmdstan_diagnose()## Processing csv files: /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu-1.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu-2.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu-3.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu-4.csvWarning: non-fatal error reading adaptation data
##
##
## Checking sampler transitions treedepth.
## Treedepth satisfactory for all transitions.
##
## Checking sampler transitions for divergences.
## No divergent transitions found.
##
## Checking E-BFMI - sampler transitions HMC potential energy.
## E-BFMI satisfactory.
##
## The following parameters had fewer than 0.001 effective draws per transition:
## p_alive[2836], p_alive[4075]
## Such low values indicate that the effective sample size estimators may be biased high and actual performance may be substantially lower than quoted.
##
## The following parameters had split R-hat greater than 1.05:
## lambda[1101], lambda[1756], lambda[2836], lambda[4004], lambda[4075], lambda[4464], mu[1101], mu[2836], mu[4075], p_alive[5], p_alive[17], p_alive[38], p_alive[76], p_alive[84], p_alive[87], p_alive[91], p_alive[94], p_alive[123], p_alive[129], p_alive[131], p_alive[145], p_alive[150], p_alive[153], p_alive[166], p_alive[232], p_alive[239], p_alive[257], p_alive[274], p_alive[281], p_alive[299], p_alive[335], p_alive[346], p_alive[357], p_alive[409], p_alive[410], p_alive[443], p_alive[460], p_alive[552], p_alive[580], p_alive[651], p_alive[669], p_alive[674], p_alive[704], p_alive[711], p_alive[719], p_alive[756], p_alive[800], p_alive[815], p_alive[819], p_alive[845], p_alive[862], p_alive[893], p_alive[894], p_alive[904], p_alive[916], p_alive[920], p_alive[921], p_alive[923], p_alive[928], p_alive[1020], p_alive[1026], p_alive[1041], p_alive[1060], p_alive[1073], p_alive[1101], p_alive[1118], p_alive[1136], p_alive[1151], p_alive[1152], p_alive[1211], p_alive[1213], p_alive[1220], p_alive[1244], p_alive[1267], p_alive[1274], p_alive[1283], p_alive[1292], p_alive[1354], p_alive[1365], p_alive[1386], p_alive[1394], p_alive[1410], p_alive[1413], p_alive[1424], p_alive[1452], p_alive[1473], p_alive[1495], p_alive[1497], p_alive[1538], p_alive[1546], p_alive[1576], p_alive[1583], p_alive[1585], p_alive[1592], p_alive[1601], p_alive[1603], p_alive[1613], p_alive[1640], p_alive[1676], p_alive[1688], p_alive[1705], p_alive[1708], p_alive[1747], p_alive[1756], p_alive[1782], p_alive[1813], p_alive[1846], p_alive[1863], p_alive[1890], p_alive[1913], p_alive[1914], p_alive[1956], p_alive[1957], p_alive[1966], p_alive[1967], p_alive[1993], p_alive[2001], p_alive[2018], p_alive[2020], p_alive[2067], p_alive[2077], p_alive[2107], p_alive[2114], p_alive[2117], p_alive[2135], p_alive[2169], p_alive[2197], p_alive[2217], p_alive[2218], p_alive[2296], p_alive[2300], p_alive[2318], p_alive[2319], p_alive[2334], p_alive[2339], p_alive[2349], p_alive[2363], p_alive[2371], 
p_alive[2386], p_alive[2405], p_alive[2410], p_alive[2456], p_alive[2498], p_alive[2508], p_alive[2555], p_alive[2558], p_alive[2567], p_alive[2602], p_alive[2603], p_alive[2604], p_alive[2632], p_alive[2651], p_alive[2656], p_alive[2700], p_alive[2787], p_alive[2798], p_alive[2801], p_alive[2821], p_alive[2822], p_alive[2836], p_alive[2853], p_alive[2857], p_alive[2878], p_alive[2881], p_alive[2890], p_alive[2898], p_alive[2931], p_alive[2934], p_alive[2952], p_alive[2953], p_alive[2998], p_alive[3000], p_alive[3016], p_alive[3021], p_alive[3027], p_alive[3032], p_alive[3045], p_alive[3054], p_alive[3080], p_alive[3084], p_alive[3085], p_alive[3095], p_alive[3118], p_alive[3123], p_alive[3124], p_alive[3147], p_alive[3179], p_alive[3206], p_alive[3224], p_alive[3247], p_alive[3249], p_alive[3253], p_alive[3266], p_alive[3289], p_alive[3294], p_alive[3316], p_alive[3336], p_alive[3345], p_alive[3349], p_alive[3354], p_alive[3361], p_alive[3390], p_alive[3393], p_alive[3399], p_alive[3400], p_alive[3436], p_alive[3472], p_alive[3488], p_alive[3493], p_alive[3496], p_alive[3527], p_alive[3533], p_alive[3540], p_alive[3551], p_alive[3565], p_alive[3581], p_alive[3619], p_alive[3633], p_alive[3640], p_alive[3652], p_alive[3660], p_alive[3674], p_alive[3697], p_alive[3706], p_alive[3710], p_alive[3711], p_alive[3731], p_alive[3750], p_alive[3752], p_alive[3779], p_alive[3847], p_alive[3886], p_alive[3906], p_alive[3910], p_alive[3936], p_alive[3960], p_alive[3985], p_alive[3991], p_alive[4004], p_alive[4010], p_alive[4023], p_alive[4034], p_alive[4045], p_alive[4075], p_alive[4078], p_alive[4087], p_alive[4105], p_alive[4131], p_alive[4135], p_alive[4141], p_alive[4146], p_alive[4176], p_alive[4195], p_alive[4219], p_alive[4260], p_alive[4266], p_alive[4278], p_alive[4313], p_alive[4322], p_alive[4341], p_alive[4352], p_alive[4366], p_alive[4415], p_alive[4434], p_alive[4442], p_alive[4457], p_alive[4464], p_alive[4488], p_alive[4508], p_alive[4526], p_alive[4530], 
p_alive[4558], p_alive[4578], p_alive[4601], p_alive[4613], p_alive[4628], p_alive[4633], p_alive[4663], p_alive[4692], p_alive[4696], p_alive[4697], p_alive[4700], p_alive[4709]
## Such high values indicate incomplete mixing and biased estimation.
## You should consider regularizating your model with additional prior information or a more effective parameterization.
##
## Processing complete.
4.1.1 Visual Diagnostics of the Sample Validity
Now that we have a sample from the posterior distribution we need to create a few different visualisations of the diagnostics.
# Traceplot of the hierarchical parameters (mu_mn, mu_cv) and their
# derived gamma parameters (s, beta), warmup excluded, to assess mixing.
# The title previously said "Lambda and Mu Values" - copied from the
# earlier per-customer traceplot - but these are the hierarchical mu
# parameters, so the title is corrected to match the plotted pars.
pnbd_extdata_hiermu_stanfit$draws(inc_warmup = FALSE) %>%
mcmc_trace(pars = c("s", "beta", "mu_mn", "mu_cv")) +
expand_limits(y = 0) +
labs(
x = "Iteration",
y = "Value",
title = "Traceplot of Sample of Hierarchical Mu Parameters"
) +
theme(axis.text.x = element_text(size = 10))A common MCMC diagnostic is \(\hat{R}\) - which is a measure of the ‘similarity’ of the chains.
# R-hat for the per-customer parameters plus the hierarchical quantities.
pnbd_extdata_hiermu_stanfit %>%
rhat(pars = c("lambda", "mu", "s", "beta", "mu_mn", "mu_cv")) %>%
mcmc_rhat() +
ggtitle("Plot of Parameter R-hat Values")Related to this quantity is the concept of effective sample size, \(N_{eff}\), an estimate of the size of the sample from a statistical information point of view.
# Effective-sample-size ratios for the per-customer and hierarchical
# parameters.
pnbd_extdata_hiermu_stanfit %>%
neff_ratio(pars = c("lambda", "mu", "s", "beta", "mu_mn", "mu_cv")) %>%
mcmc_neff() +
ggtitle("Plot of Parameter Effective Sample Sizes")Finally, we also want to look at autocorrelation in the chains for each parameter.
# Autocorrelation of the chains for the hierarchical parameters.
pnbd_extdata_hiermu_stanfit$draws() %>%
mcmc_acf(pars = c("s", "beta", "mu_mn", "mu_cv")) +
ggtitle("Autocorrelation Plot of Sample Values")As before, this first fit has a comprehensive run of fit diagnostics, but for the sake of brevity in later models we will show only the traceplots once we are satisfied with the validity of the sample.
4.1.2 Validate the Hier-Mu Model Fit
We now want to validate this model by using our simulation technique.
We first extract our posterior by customer, and use this as the basis of our simulations.
# Extract per-customer posterior draws of lambda, mu and p_alive into a
# long tibble - one row per (customer, draw) - for the simulation step.
pnbd_extdata_hiermu_validation_tbl <- pnbd_extdata_hiermu_stanfit %>%
recover_types(btyd_fitdata_tbl) %>%
spread_draws(lambda[customer_id], mu[customer_id], p_alive[customer_id]) %>%
ungroup() %>%
select(
customer_id, draw_id = .draw, post_lambda = lambda, post_mu = mu, p_alive
)
pnbd_extdata_hiermu_validation_tbl %>% glimpse()## Rows: 9,432,000
## Columns: 5
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12346", "12346", "12346", "12346", "12346", "123…
## $ draw_id [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ post_lambda [3m[38;5;246m<dbl>[39m[23m 0.1345560, 0.0973957, 0.1681480, 0.1261070, 0.1733330, 0.2…
## $ post_mu [3m[38;5;246m<dbl>[39m[23m 8.91034e-11, 1.47308e-10, 6.25425e-03, 5.82612e-16, 7.4343…
## $ p_alive [3m[38;5;246m<dbl>[39m[23m 1.000000, 1.000000, 0.845374, 1.000000, 0.815074, 1.000000…
We now see there are a number of validity issues with our sample, and this needs some exploration.
Let’s start by looking at the hierarchical parameters mu_mn, mu_cv, s and
beta.
# Pull the non-customer-indexed draws (hierarchical parameters plus sampler
# diagnostics) by dropping every variable with an index bracket in its name.
pnbd_extdata_hiermu_noncustomer_tbl <- pnbd_extdata_hiermu_stanfit %>%
tidy_draws() %>%
select(!contains("["))
# Long format of the hierarchical/diagnostic values for faceted histograms.
plot_tbl <- pnbd_extdata_hiermu_noncustomer_tbl %>%
select(.draw, mu_mn, mu_cv, s, beta, lp__, energy__) %>%
pivot_longer(cols = !.draw, names_to = "param", values_to = "value")
# Faceted histograms of the hierarchical parameters and sampler diagnostics.
ggplot(plot_tbl) +
geom_histogram(aes(x = value), bins = 50) +
facet_wrap(vars(param), scales = "free_x") +
labs(
x = "Value",
y = "Frequency",
title = "Histograms of Hierarchical and Diagnostic Quantities"
)We see from these histograms that the hierarchical parameters mu_mn and
mu_cv looks to be way too high, so there is an issue here.
We now should look at the \(\mu\) values for individual customers. In each case we have a distribution for each customer, so we start by looking at the median value of each customer. We could also check the mean values, but these distributions may be skewed, so we may get a better picture from using the median value.
# Per-customer posterior summaries: both mean and median of mu, lambda and
# p_alive. Medians are kept as well since skewed posteriors can make the
# mean misleading.
pnbd_extdata_hiermu_summvals_tbl <- pnbd_extdata_hiermu_validation_tbl %>%
group_by(customer_id) %>%
summarise(
.groups = "drop",
mean_mu = mean(post_mu),
mean_lambda = mean(post_lambda),
mean_p_alive = mean(p_alive),
median_mu = median(post_mu),
median_lambda = median(post_lambda),
median_pa = median(p_alive)
)
# Long format of the per-customer summaries for faceted histograms.
plot_tbl <- pnbd_extdata_hiermu_summvals_tbl %>%
pivot_longer(
cols = !customer_id,
names_to = "param",
values_to = "value"
)
# Faceted histograms of the per-customer posterior summary statistics.
ggplot(plot_tbl) +
geom_histogram(aes(x = value), bins = 50) +
facet_wrap(vars(param), scales = "free_x") +
labs(
x = "Value",
y = "Frequency",
title = "Histograms of Customer Quantities"
)We see the posterior is showing a strong multi-modality for all our parameters so we want to check both \(\mu\) and \(\lambda\) together.
# Scatterplot of the per-customer posterior medians of mu and lambda.
# The axis labels previously read "Mean Mu" / "Mean Lambda" although the
# plotted aesthetics are median_mu / median_lambda (and the title says
# "Posterior Median"); the labels are corrected and the y aesthetic is
# named explicitly for clarity.
ggplot(pnbd_extdata_hiermu_summvals_tbl) +
geom_point(aes(x = median_mu, y = median_lambda)) +
labs(
x = "Median Mu",
y = "Median Lambda",
title = "Scatterplot of Posterior Median of Lambda and Mu"
)We see there is a cohort of customers that have a very short lifetime (high values of \(\mu\) as the mean lifetime is \(1 / \mu\)).
We can also see that the data induces a conditional dependency between a customer’s values of \(\mu\) and \(\lambda\).
It may be best to visualise the transactions of these customers with extreme values of \(\mu\), so we focus purely on the customers that have a median value of \(\mu\) above 10.
# Focus on the 'extreme' customers - posterior median mu above 10, i.e.
# very short implied lifetimes - and build their calibration-period
# transaction history, ranked by the date of their first transaction.
filter_custs_tbl <- pnbd_extdata_hiermu_summvals_tbl %>%
filter(median_mu > 10)
extreme_tnx_tbl <- customer_transactions_tbl %>%
filter(tnx_timestamp < as.POSIXct("2011-04-01")) %>%
semi_join(filter_custs_tbl, by = "customer_id") %>%
group_by(customer_id) %>%
mutate(tnx_first = min(tnx_timestamp)) %>%
ungroup() %>%
# Grouping by (tnx_first, customer_id) gives cur_group_id() values ordered
# by first-transaction date, which is used as the plotting rank.
group_by(tnx_first, customer_id) %>%
mutate(
cust_rank = cur_group_id(),
tnx_count = n()
) %>%
ungroup()
# One horizontal line per extreme customer (ordered by first transaction),
# points marking each transaction, coloured by the transaction count.
ggplot(
extreme_tnx_tbl,
aes(x = tnx_timestamp, y = cust_rank, group = cust_rank, colour = as.character(tnx_count))
) +
geom_line() +
geom_point() +
labs(
x = "Transaction Time",
y = "Customer Rank",
colour = "Tnx Count",
title = "Plot of Individual Customer Transactions of Extreme Customers"
)There are many customers in this transaction so we focus on the earliest 75 customers to get a closer look.
# Same plot restricted to the 75 earliest-ranked extreme customers.
ggplot(
extreme_tnx_tbl %>% filter(cust_rank <= 75),
aes(x = tnx_timestamp, y = cust_rank, group = cust_rank, colour = as.character(tnx_count))
) +
geom_line() +
geom_point() +
labs(
x = "Transaction Time",
y = "Customer Rank",
colour = "Tnx Count",
title = "Plot of Individual Customer Transactions of Extreme Customers"
)We see that there are a number of customers that transact in a short period of time after they become active and then stop. This cohort of customers is likely the source of our multi-modality, so we first focus on removing those from our initial dataset and see what effect this has on our fit.
# Mark as 'extreme' any customer with no repeat purchases (x == 0) or whose
# most recent purchase fell within 0.15 time units of their first
# (t_x is the time to the most recent purchase; unit presumably years -
# TODO confirm), then build a trimmed fitting dataset excluding them.
extreme_customers_tbl <- btyd_fitdata_tbl %>%
filter(x == 0 | t_x < 0.15)
btyd_trimmed_fitdata_tbl <- btyd_fitdata_tbl %>%
anti_join(extreme_customers_tbl, by = "customer_id")4.2 Second Fit Attempt
Before we remove the data from this model, we instead try to refit with the
same amount of data, but with a much tighter prior on mu_cv.
# Second attempt with the full dataset: identical setup to the first
# hierarchical fit except a much tighter hyperprior on mu_cv (sdlog 0.01
# instead of 0.2) to regularise the lifetime heterogeneity.
stan_modelname <- "pnbd_extdata_hiermu2"
stanfit_prefix <- str_c("fit_", stan_modelname)
stan_data_lst <- btyd_fitdata_tbl %>%
select(customer_id, x, t_x, T_cal) %>%
compose_data(
lambda_mn = 1.00,
lambda_cv = 0.75,
mu_mn_p1 = log(0.05) - 0.5 * (0.5)^2,
mu_mn_p2 = 0.5,
mu_cv_p1 = log(0.5) - 0.5 * (0.01)^2,
mu_cv_p2 = 0.01,
)
# Re-sample using the already-compiled hierarchical-mu model.
pnbd_extdata_hiermu2_stanfit <- pnbd_extdata_hiermu_stanmodel$sample(
data = stan_data_lst,
chains = 4,
iter_warmup = 500,
iter_sampling = 500,
seed = 4205,
save_warmup = TRUE,
adapt_delta = 0.90,
max_treedepth = 10,
output_dir = stan_modeldir,
output_basename = stanfit_prefix,
)## Running MCMC with 4 parallel chains...
##
## Chain 1 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 2 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 3 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 4 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 4 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 1 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 4 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 3 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 2 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 4 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 1 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 3 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 1 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 4 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 2 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 3 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 1 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 4 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 4 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 2 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 3 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 1 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 1 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 2 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 4 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 3 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 3 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 1 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 1 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 2 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 2 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 4 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 3 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 1 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 1 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 2 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 4 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 3 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 1 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 1 finished in 321.8 seconds.
## Chain 2 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 3 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 4 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 2 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 3 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 4 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 4 finished in 357.5 seconds.
## Chain 2 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 3 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 3 finished in 372.0 seconds.
## Chain 2 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 2 finished in 387.3 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 359.6 seconds.
## Total execution time: 387.7 seconds.
pnbd_extdata_hiermu2_stanfit$summary()## # A tibble: 14,153 × 10
## variable mean median sd mad q5 q95 rhat ess_bulk
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 lp__ -7.53e+4 -7.53e+4 8.47e+1 8.51e+1 -7.54e+4 -7.51e+4 1.00 579.
## 2 mu_mn 2.30e-2 2.30e-2 6.36e-4 6.40e-4 2.20e-2 2.41e-2 1.03 189.
## 3 mu_cv 5.09e-1 5.09e-1 5.16e-3 5.08e-3 5.01e-1 5.18e-1 1.00 933.
## 4 lambda[1] 1.93e-1 1.86e-1 5.34e-2 5.50e-2 1.18e-1 2.87e-1 1.00 3445.
## 5 lambda[2] 1.87e-1 1.67e-1 1.01e-1 8.63e-2 5.98e-2 3.80e-1 1.00 3993.
## 6 lambda[3] 1.46e-1 1.33e-1 7.46e-2 6.68e-2 5.11e-2 2.89e-1 1.00 3335.
## 7 lambda[4] 1.01e-1 8.99e-2 5.79e-2 5.08e-2 3.10e-2 2.10e-1 1.00 3422.
## 8 lambda[5] 3.48e-1 2.19e-1 3.87e-1 1.89e-1 4.42e-2 1.14e+0 1.00 2825.
## 9 lambda[6] 4.58e-1 2.73e-1 5.17e-1 2.91e-1 2.73e-2 1.53e+0 1.00 2982.
## 10 lambda[7] 3.59e-1 3.42e-1 1.34e-1 1.28e-1 1.71e-1 6.03e-1 1.01 4636.
## # … with 14,143 more rows, and 1 more variable: ess_tail <dbl>
We have some basic HMC-based validity statistics we can check.
pnbd_extdata_hiermu2_stanfit$cmdstan_diagnose()## Processing csv files: /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu2-1.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu2-2.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu2-3.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu2-4.csvWarning: non-fatal error reading adaptation data
##
##
## Checking sampler transitions treedepth.
## Treedepth satisfactory for all transitions.
##
## Checking sampler transitions for divergences.
## No divergent transitions found.
##
## Checking E-BFMI - sampler transitions HMC potential energy.
## E-BFMI satisfactory.
##
## Effective sample size satisfactory.
##
## Split R-hat values satisfactory all parameters.
##
## Processing complete, no problems detected.
4.2.1 Visual Diagnostics of the Sample Validity
Now that we have a sample from the posterior distribution we need to create a few different visualisations of the diagnostics.
# Traceplot of the hierarchical parameters (s, beta, mu_mn, mu_cv); well-mixed,
# stationary traces across chains indicate a healthy sample.
# NOTE: title corrected — the plotted pars are the hierarchical parameters,
# not the per-customer lambda/mu values.
pnbd_extdata_hiermu2_stanfit$draws(inc_warmup = FALSE) %>%
mcmc_trace(pars = c("s", "beta", "mu_mn", "mu_cv")) +
expand_limits(y = 0) +
labs(
x = "Iteration",
y = "Value",
title = "Traceplot of the Model Hierarchical Parameters"
) +
theme(axis.text.x = element_text(size = 10))A common MCMC diagnostic is \(\hat{R}\) - which is a measure of the ‘similarity’ of the chains.
# R-hat compares within-chain and between-chain variation; values near 1
# indicate the chains agree on the posterior.
pnbd_extdata_hiermu2_stanfit %>%
rhat(pars = c("lambda", "mu", "s", "beta", "mu_mn", "mu_cv")) %>%
mcmc_rhat() +
ggtitle("Plot of Parameter R-hat Values")Related to this quantity is the concept of effective sample size, \(N_{eff}\), an estimate of the size of the sample from a statistical information point of view.
# Ratio of effective sample size to total draws for each parameter.
pnbd_extdata_hiermu2_stanfit %>%
neff_ratio(pars = c("lambda", "mu", "s", "beta", "mu_mn", "mu_cv")) %>%
mcmc_neff() +
ggtitle("Plot of Parameter Effective Sample Sizes")Finally, we also want to look at autocorrelation in the chains for each parameter.
# Within-chain autocorrelation for the hierarchical parameters.
pnbd_extdata_hiermu2_stanfit$draws() %>%
mcmc_acf(pars = c("s", "beta", "mu_mn", "mu_cv")) +
ggtitle("Autocorrelation Plot of Sample Values")As before, this first fit has a comprehensive run of fit diagnostics, but for the sake of brevity in later models we will show only the traceplots once we are satisfied with the validity of the sample.
4.2.2 Validate the Hier-Mu Model Fit
We now want to validate this model by using our simulation technique.
We first extract our posterior by customer, and use this as the basis of our simulations.
# Extract the per-customer posterior draws of lambda, mu and p_alive:
# one row per (customer_id, draw) pair.
# recover_types() restores the original customer_id values from the fit data.
pnbd_extdata_hiermu2_validation_tbl <- pnbd_extdata_hiermu2_stanfit %>%
recover_types(btyd_fitdata_tbl) %>%
spread_draws(lambda[customer_id], mu[customer_id], p_alive[customer_id]) %>%
ungroup() %>%
select(
customer_id, draw_id = .draw, post_lambda = lambda, post_mu = mu, p_alive
)
pnbd_extdata_hiermu2_validation_tbl %>% glimpse()## Rows: 9,432,000
## Columns: 5
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12346", "12346", "12346", "12346", "12346", "123…
## $ draw_id [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ post_lambda [3m[38;5;246m<dbl>[39m[23m 0.224872, 0.233108, 0.125107, 0.161266, 0.227044, 0.160325…
## $ post_mu [3m[38;5;246m<dbl>[39m[23m 0.02312050, 0.02366880, 0.01665670, 0.01583930, 0.01345250…
## $ p_alive [3m[38;5;246m<dbl>[39m[23m 0.470226, 0.448648, 0.717622, 0.679502, 0.616817, 0.555035…
# Keep only the scalar (non-indexed) draws: dropping names containing "["
# removes the per-customer indexed variables such as lambda[i] and mu[i].
pnbd_extdata_hiermu2_noncustomer_tbl <- pnbd_extdata_hiermu2_stanfit %>%
tidy_draws() %>%
select(!contains("["))
# Reshape the hierarchical parameters and sampler diagnostics (lp__, energy__)
# to long format for faceted histograms.
plot_tbl <- pnbd_extdata_hiermu2_noncustomer_tbl %>%
select(.draw, mu_mn, mu_cv, s, beta, lp__, energy__) %>%
pivot_longer(cols = !.draw, names_to = "param", values_to = "value")
ggplot(plot_tbl) +
geom_histogram(aes(x = value), bins = 50) +
facet_wrap(vars(param), scales = "free_x") +
labs(
x = "Value",
y = "Frequency",
title = "Histograms of Hierarchical and Diagnostic Quantities"
)pnbd_extdata_hiermu2_summvals_tbl <- pnbd_extdata_hiermu2_validation_tbl %>%
group_by(customer_id) %>%
# Posterior mean and median of mu, lambda and p_alive for each customer.
summarise(
.groups = "drop",
mean_mu = mean(post_mu),
mean_lambda = mean(post_lambda),
mean_p_alive = mean(p_alive),
median_mu = median(post_mu),
median_lambda = median(post_lambda),
median_pa = median(p_alive)
)
# Reshape the per-customer posterior summaries to long format for faceting.
plot_tbl <- pnbd_extdata_hiermu2_summvals_tbl %>%
pivot_longer(
cols = !customer_id,
names_to = "param",
values_to = "value"
)
# Faceted histograms of each per-customer summary statistic.
# The scatterplot below plots posterior *medians* (median_mu vs median_lambda),
# so the axis labels are corrected from "Mean" to "Median", and the y
# aesthetic is named explicitly rather than relying on positional matching.
ggplot(plot_tbl) +
geom_histogram(aes(x = value), bins = 50) +
facet_wrap(vars(param), scales = "free_x") +
labs(
x = "Value",
y = "Frequency",
title = "Histograms of Customer Quantities"
)ggplot(pnbd_extdata_hiermu2_summvals_tbl) +
geom_point(aes(x = median_mu, y = median_lambda)) +
labs(
x = "Median Mu",
y = "Median Lambda",
title = "Scatterplot of Posterior Median of Lambda and Mu"
)4.3 Third Fit Attempt
Before removing any data from this model, we first try refitting with the
same amount of data, but with a much tighter prior on mu_cv.
# Assemble the Stan data list for the third fit. Per the text above, this fit
# uses the same (trimmed) data but a much tighter prior on mu_cv.
stan_modelname <- "pnbd_extdata_hiermu3"
stanfit_prefix <- str_c("fit_", stan_modelname)
stan_data_lst <- btyd_trimmed_fitdata_tbl %>%
select(customer_id, x, t_x, T_cal) %>%
compose_data(
lambda_mn = 1.00,
lambda_cv = 0.75,
# meanlog = log(m) - 0.5 * sdlog^2 gives a lognormal with mean m.
mu_mn_p1 = log(0.05) - 0.5 * (0.5)^2,
mu_mn_p2 = 0.5,
# Tight prior: lognormal around 0.5 with sdlog = 0.05.
mu_cv_p1 = log(0.5) - 0.5 * (0.05)^2,
mu_cv_p2 = 0.05,
)
# Run HMC/NUTS on the hier-mu model with the tightened prior.
# Fixed seed for reproducibility; CSV output is written under stan_modeldir.
pnbd_extdata_hiermu3_stanfit <- pnbd_extdata_hiermu_stanmodel$sample(
data = stan_data_lst,
chains = 4,
iter_warmup = 500,
iter_sampling = 500,
seed = 4205,
save_warmup = TRUE,
# Slightly stricter sampler controls than the cmdstanr defaults.
adapt_delta = 0.90,
max_treedepth = 10,
output_dir = stan_modeldir,
output_basename = stanfit_prefix,
)## Running MCMC with 4 parallel chains...
##
## Chain 1 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 2 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 3 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 4 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 4 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 3 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 2 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 1 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 4 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 3 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 2 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 1 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 4 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 3 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 2 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 1 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 4 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 3 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 2 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 1 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 4 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 4 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 3 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 3 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 2 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 2 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 1 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 4 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 1 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 3 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 2 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 4 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 3 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 2 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 1 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 4 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 3 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 2 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 4 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 3 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 2 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 1 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 4 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 4 finished in 153.5 seconds.
## Chain 3 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 3 finished in 155.0 seconds.
## Chain 2 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 2 finished in 156.1 seconds.
## Chain 1 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 1 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 1 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 1 finished in 183.6 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 162.0 seconds.
## Total execution time: 184.0 seconds.
pnbd_extdata_hiermu3_stanfit$summary()## # A tibble: 9,242 × 10
## variable mean median sd mad q5 q95 rhat ess_bulk
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 lp__ -6.46e+4 -6.46e+4 1.78e+2 1.80e+2 -6.49e+4 -6.43e+4 1.05 82.0
## 2 mu_mn 6.88e-3 6.87e-3 3.05e-4 3.08e-4 6.38e-3 7.37e-3 1.04 91.3
## 3 mu_cv 4.94e-1 4.94e-1 2.58e-2 2.57e-2 4.52e-1 5.36e-1 1.05 84.0
## 4 lambda[1] 1.89e-1 1.83e-1 5.39e-2 5.16e-2 1.08e-1 2.80e-1 1.00 3002.
## 5 lambda[2] 1.71e-1 1.57e-1 9.25e-2 8.48e-2 5.50e-2 3.58e-1 1.00 2359.
## 6 lambda[3] 1.39e-1 1.25e-1 7.45e-2 6.83e-2 4.48e-2 2.76e-1 1.00 2360.
## 7 lambda[4] 8.77e-2 7.89e-2 4.78e-2 4.19e-2 2.76e-2 1.77e-1 1.00 2213.
## 8 lambda[5] 3.60e-1 3.44e-1 1.28e-1 1.21e-1 1.78e-1 5.81e-1 1.00 3477.
## 9 lambda[6] 1.98e-1 1.81e-1 9.70e-2 8.74e-2 7.42e-2 3.78e-1 0.999 2403.
## 10 lambda[7] 5.44e-2 4.87e-2 2.92e-2 2.68e-2 1.68e-2 1.10e-1 1.00 1822.
## # … with 9,232 more rows, and 1 more variable: ess_tail <dbl>
We have some basic HMC-based validity statistics we can check.
pnbd_extdata_hiermu3_stanfit$cmdstan_diagnose()## Processing csv files: /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu3-1.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu3-2.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu3-3.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermu3-4.csvWarning: non-fatal error reading adaptation data
##
##
## Checking sampler transitions treedepth.
## Treedepth satisfactory for all transitions.
##
## Checking sampler transitions for divergences.
## No divergent transitions found.
##
## Checking E-BFMI - sampler transitions HMC potential energy.
## The E-BFMI, 0.18, is below the nominal threshold of 0.30 which suggests that HMC may have trouble exploring the target distribution.
## If possible, try to reparameterize the model.
##
## Effective sample size satisfactory.
##
## The following parameters had split R-hat greater than 1.05:
## mu_cv, beta
## Such high values indicate incomplete mixing and biased estimation.
## You should consider regularizating your model with additional prior information or a more effective parameterization.
##
## Processing complete.
4.3.1 Visual Diagnostics of the Sample Validity
Now that we have a sample from the posterior distribution we need to create a few different visualisations of the diagnostics.
# Traceplot of the hierarchical parameters (s, beta, mu_mn, mu_cv) for the
# third fit.
# NOTE: title corrected — the plotted pars are the hierarchical parameters,
# not the per-customer lambda/mu values.
pnbd_extdata_hiermu3_stanfit$draws(inc_warmup = FALSE) %>%
mcmc_trace(pars = c("s", "beta", "mu_mn", "mu_cv")) +
expand_limits(y = 0) +
labs(
x = "Iteration",
y = "Value",
title = "Traceplot of the Model Hierarchical Parameters"
) +
theme(axis.text.x = element_text(size = 10))A common MCMC diagnostic is \(\hat{R}\) - which is a measure of the ‘similarity’ of the chains.
# R-hat per parameter; values near 1 indicate the chains agree. Note the
# diagnose output above already flagged mu_cv and beta as above 1.05.
pnbd_extdata_hiermu3_stanfit %>%
rhat(pars = c("lambda", "mu", "s", "beta", "mu_mn", "mu_cv")) %>%
mcmc_rhat() +
ggtitle("Plot of Parameter R-hat Values")Related to this quantity is the concept of effective sample size, \(N_{eff}\), an estimate of the size of the sample from a statistical information point of view.
# Ratio of effective sample size to total draws for each parameter.
pnbd_extdata_hiermu3_stanfit %>%
neff_ratio(pars = c("lambda", "mu", "s", "beta", "mu_mn", "mu_cv")) %>%
mcmc_neff() +
ggtitle("Plot of Parameter Effective Sample Sizes")Finally, we also want to look at autocorrelation in the chains for each parameter.
# Within-chain autocorrelation for the hierarchical parameters.
pnbd_extdata_hiermu3_stanfit$draws() %>%
mcmc_acf(pars = c("s", "beta", "mu_mn", "mu_cv")) +
ggtitle("Autocorrelation Plot of Sample Values")As before, this first fit has a comprehensive run of fit diagnostics, but for the sake of brevity in later models we will show only the traceplots once we are satisfied with the validity of the sample.
4.3.2 Validate the Hier-Mu Model Fit
We now want to validate this model by using our simulation technique.
We first extract our posterior by customer, and use this as the basis of our simulations.
# Extract the per-customer posterior draws of lambda, mu and p_alive:
# one row per (customer_id, draw) pair.
# recover_types() restores the original customer_id values from the trimmed
# fit data used for this model.
pnbd_extdata_hiermu3_validation_tbl <- pnbd_extdata_hiermu3_stanfit %>%
recover_types(btyd_trimmed_fitdata_tbl) %>%
spread_draws(lambda[customer_id], mu[customer_id], p_alive[customer_id]) %>%
ungroup() %>%
select(
customer_id, draw_id = .draw, post_lambda = lambda, post_mu = mu, p_alive
)
pnbd_extdata_hiermu3_validation_tbl %>% glimpse()## Rows: 6,158,000
## Columns: 5
## $ customer_id [3m[38;5;246m<chr>[39m[23m "12346", "12346", "12346", "12346", "12346", "12346", "123…
## $ draw_id [3m[38;5;246m<int>[39m[23m 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ post_lambda [3m[38;5;246m<dbl>[39m[23m 0.2790190, 0.1937530, 0.1745190, 0.2303010, 0.1838380, 0.1…
## $ post_mu [3m[38;5;246m<dbl>[39m[23m 0.00635773, 0.00927036, 0.00408082, 0.00401887, 0.00315427…
## $ p_alive [3m[38;5;246m<dbl>[39m[23m 0.710609, 0.752366, 0.890677, 0.849185, 0.908771, 0.787225…
# Keep only the scalar (non-indexed) draws: dropping names containing "["
# removes the per-customer indexed variables such as lambda[i] and mu[i].
pnbd_extdata_hiermu3_noncustomer_tbl <- pnbd_extdata_hiermu3_stanfit %>%
tidy_draws() %>%
select(!contains("["))
# Reshape the hierarchical parameters and sampler diagnostics (lp__, energy__)
# to long format for faceted histograms.
plot_tbl <- pnbd_extdata_hiermu3_noncustomer_tbl %>%
select(.draw, mu_mn, mu_cv, s, beta, lp__, energy__) %>%
pivot_longer(cols = !.draw, names_to = "param", values_to = "value")
ggplot(plot_tbl) +
geom_histogram(aes(x = value), bins = 50) +
facet_wrap(vars(param), scales = "free_x") +
labs(
x = "Value",
y = "Frequency",
title = "Histograms of Hierarchical and Diagnostic Quantities"
)pnbd_extdata_hiermu3_summvals_tbl <- pnbd_extdata_hiermu3_validation_tbl %>%
group_by(customer_id) %>%
# Posterior mean and median of mu, lambda and p_alive for each customer.
summarise(
.groups = "drop",
mean_mu = mean(post_mu),
mean_lambda = mean(post_lambda),
mean_p_alive = mean(p_alive),
median_mu = median(post_mu),
median_lambda = median(post_lambda),
median_pa = median(p_alive)
)
# Reshape the per-customer posterior summaries to long format for faceting.
plot_tbl <- pnbd_extdata_hiermu3_summvals_tbl %>%
pivot_longer(
cols = !customer_id,
names_to = "param",
values_to = "value"
)
# Faceted histograms of each per-customer summary statistic.
# The scatterplot below plots posterior *medians* (median_mu vs median_lambda),
# so the axis labels are corrected from "Mean" to "Median", and the y
# aesthetic is named explicitly rather than relying on positional matching.
ggplot(plot_tbl) +
geom_histogram(aes(x = value), bins = 50) +
facet_wrap(vars(param), scales = "free_x") +
labs(
x = "Value",
y = "Frequency",
title = "Histograms of Customer Quantities"
)ggplot(pnbd_extdata_hiermu3_summvals_tbl) +
geom_point(aes(x = median_mu, y = median_lambda)) +
labs(
x = "Median Mu",
y = "Median Lambda",
title = "Scatterplot of Posterior Median of Lambda and Mu"
)5 Fit Two-Mean Prior Model
We now attempt to place lognormal hyper-priors on the means of both the \(\mu\) and \(\lambda\) distributions.
This is unlikely to work as the additional freedom of the model is likely to re-introduce degeneracies into this model, but it is worth the attempt.
## functions {
## #include util_functions.stan
## }
##
## data {
## int<lower=1> n; // number of customers
##
## vector<lower=0>[n] t_x; // time to most recent purchase
## vector<lower=0>[n] T_cal; // total observation time
## vector<lower=0>[n] x; // number of purchases observed
##
## real<lower=0> lambda_cv; // prior cv for lambda
## real<lower=0> mu_cv; // prior cv for mu
##
## real lambda_mn_p1; // hyperprior p1 for lambda mean
## real<lower=0> lambda_mn_p2; // hyperprior p2 for lambda mean
##
## real mu_mn_p1; // hyperprior p1 for mu mean
## real<lower=0> mu_mn_p2; // hyperprior p2 for mu mean
## }
##
##
## parameters {
## real<lower=0> lambda_mn;
## real<lower=0> mu_mn;
##
## vector<lower=0>[n] lambda; // purchase rate
## vector<lower=0>[n] mu; // lifetime dropout rate
## }
##
## transformed parameters {
## real<lower=0> r = 1 / (lambda_cv * lambda_cv);
## real<lower=0> alpha = 1 / (lambda_cv * lambda_cv * lambda_mn);
##
## real<lower=0> s = 1 / (mu_cv * mu_cv);
## real<lower=0> beta = 1 / (mu_cv * mu_cv * mu_mn);
## }
##
## model {
## // model the hyper-prior
## lambda_mn ~ lognormal(lambda_mn_p1, lambda_mn_p2);
## mu_mn ~ lognormal(mu_mn_p1, mu_mn_p2);
##
## // setting priors
## lambda ~ gamma(r, alpha);
## mu ~ gamma(s, beta);
##
## target += calculate_pnbd_loglik(n, lambda, mu, x, t_x, T_cal);
## }
##
## generated quantities {
## vector[n] p_alive; // Probability that they are still "alive"
##
## p_alive = 1 ./ (1 + mu ./ (mu + lambda) .* (exp((lambda + mu) .* (T_cal - t_x)) - 1));
## }
We now compile this model using CmdStanR.
# Compile the two-mean hierarchical P/NBD Stan model shown above.
# pedantic = TRUE enables extra compile-time checks of the Stan program.
pnbd_extdata_hiermeans_stanmodel <- cmdstan_model(
"stan_code/pnbd_hiermeans.stan",
include_paths = stan_codedir,
pedantic = TRUE,
dir = stan_modeldir
)5.1 Run the Stan Model on the Data
We then use this compiled model with our data to produce a fit of the data.
# Data list for the two-mean model: both cv values are now fixed data, while
# lognormal hyper-priors are placed on the two means (see the model code above).
stan_modelname <- "pnbd_extdata_hiermeans"
stanfit_prefix <- str_c("fit_", stan_modelname)
stan_data_lst <- btyd_fitdata_tbl %>%
select(customer_id, x, t_x, T_cal) %>%
compose_data(
# meanlog = log(m) - 0.5 * sdlog^2 gives a lognormal with mean m.
lambda_mn_p1 = log(1) - 0.5 * (0.5)^2,
lambda_mn_p2 = 0.5,
mu_mn_p1 = log(0.05) - 0.5 * (0.1)^2,
mu_mn_p2 = 0.1,
lambda_cv = 0.75,
mu_cv = 0.1
)
# Run HMC/NUTS on the two-mean model. Sampler control arguments are left at
# their defaults here (adapt_delta / max_treedepth commented out).
pnbd_extdata_hiermeans_stanfit <- pnbd_extdata_hiermeans_stanmodel$sample(
data = stan_data_lst,
chains = 4,
iter_warmup = 500,
iter_sampling = 500,
seed = 4206,
save_warmup = TRUE,
# adapt_delta = 0.90,
# max_treedepth = 10,
output_dir = stan_modeldir,
output_basename = stanfit_prefix,
)## Running MCMC with 4 parallel chains...
##
## Chain 1 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 2 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 3 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 4 Iteration: 1 / 1000 [ 0%] (Warmup)
## Chain 1 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 4 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 2 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 3 Iteration: 100 / 1000 [ 10%] (Warmup)
## Chain 4 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 1 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 2 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 3 Iteration: 200 / 1000 [ 20%] (Warmup)
## Chain 4 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 1 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 4 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 2 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 1 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 3 Iteration: 300 / 1000 [ 30%] (Warmup)
## Chain 2 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 4 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 4 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 1 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 1 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 2 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 2 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 3 Iteration: 400 / 1000 [ 40%] (Warmup)
## Chain 4 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 2 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 1 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 2 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 3 Iteration: 500 / 1000 [ 50%] (Warmup)
## Chain 3 Iteration: 501 / 1000 [ 50%] (Sampling)
## Chain 4 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 2 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 1 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 2 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 3 Iteration: 600 / 1000 [ 60%] (Sampling)
## Chain 4 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 2 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 2 finished in 444.7 seconds.
## Chain 1 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 3 Iteration: 700 / 1000 [ 70%] (Sampling)
## Chain 4 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 1 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 3 Iteration: 800 / 1000 [ 80%] (Sampling)
## Chain 4 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 4 finished in 481.3 seconds.
## Chain 1 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 1 finished in 489.3 seconds.
## Chain 3 Iteration: 900 / 1000 [ 90%] (Sampling)
## Chain 3 Iteration: 1000 / 1000 [100%] (Sampling)
## Chain 3 finished in 513.6 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 482.2 seconds.
## Total execution time: 514.5 seconds.
pnbd_extdata_hiermeans_stanfit$summary()## # A tibble: 14,155 × 10
## variable mean median sd mad q5 q95 rhat ess_bulk
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 lp__ -5.39e+4 -5.39e+4 7.29e+1 7.29e+1 -5.40e+4 -5.37e+4 1.01 755.
## 2 lambda_mn 9.80e-2 9.80e-2 1.46e-3 1.40e-3 9.56e-2 1.01e-1 1.02 191.
## 3 mu_mn 1.07e-2 1.07e-2 3.37e-4 3.21e-4 1.02e-2 1.13e-2 1.47 7.89
## 4 lambda[1] 1.52e-1 1.48e-1 4.21e-2 4.02e-2 9.12e-2 2.29e-1 1.00 3030.
## 5 lambda[2] 9.44e-2 8.65e-2 4.80e-2 4.52e-2 3.05e-2 1.82e-1 0.999 2358.
## 6 lambda[3] 8.72e-2 7.79e-2 4.49e-2 4.30e-2 2.96e-2 1.73e-1 1.00 2846.
## 7 lambda[4] 6.35e-2 5.66e-2 3.53e-2 3.13e-2 1.88e-2 1.27e-1 1.00 3260.
## 8 lambda[5] 6.97e-2 5.74e-2 5.24e-2 4.44e-2 1.04e-2 1.75e-1 1.00 2959.
## 9 lambda[6] 5.63e-2 4.55e-2 4.37e-2 3.61e-2 8.50e-3 1.43e-1 1.00 2649.
## 10 lambda[7] 2.05e-1 1.96e-1 7.35e-2 7.19e-2 9.70e-2 3.37e-1 1.00 2989.
## # … with 14,145 more rows, and 1 more variable: ess_tail <dbl>
We have some basic HMC-based validity statistics we can check.
pnbd_extdata_hiermeans_stanfit$cmdstan_diagnose()## Processing csv files: /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermeans-1.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermeans-2.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermeans-3.csvWarning: non-fatal error reading adaptation data
## , /home/rstudio/workshop/stan_models/fit_pnbd_extdata_hiermeans-4.csvWarning: non-fatal error reading adaptation data
##
##
## Checking sampler transitions treedepth.
## Treedepth satisfactory for all transitions.
##
## Checking sampler transitions for divergences.
## No divergent transitions found.
##
## Checking E-BFMI - sampler transitions HMC potential energy.
## E-BFMI satisfactory.
##
## Effective sample size satisfactory.
##
## The following parameters had split R-hat greater than 1.05:
## mu_mn, beta
## Such high values indicate incomplete mixing and biased estimation.
## You should consider regularizating your model with additional prior information or a more effective parameterization.
##
## Processing complete.
# Traceplot of the hierarchical parameters for the two-mean model. The poor
# mu_mn mixing flagged above (rhat 1.47, ess ~8) should be visible here.
pnbd_extdata_hiermeans_stanfit$draws(inc_warmup = FALSE) %>%
mcmc_trace(pars = c("lambda_mn", "mu_mn", "alpha", "beta")) +
expand_limits(y = 0) +
labs(
x = "Iteration",
y = "Value",
title = "Traceplot of the Model Hierarchical Parameters"
) +
theme(axis.text.x = element_text(size = 10))6 Write Data to Disk
We now write the data used to fit our models to disk.
# Persist the datasets used to fit the models so later documents can reuse them.
customer_transactions_tbl %>% write_rds("data/customer_transactions_tbl.rds")
btyd_fitdata_tbl %>% write_rds("data/btyd_fitdata_tbl.rds")
btyd_obs_stats_tbl %>% write_rds("data/btyd_obs_stats_tbl.rds")
extreme_customers_tbl %>% write_rds("data/extreme_customers_tbl.rds")7 R Environment
# Temporarily widen the console so the session info prints without wrapping.
options(width = 120L)
sessioninfo::session_info()## ─ Session info ───────────────────────────────────────────────────────────────────────────────────────────────────────
## setting value
## version R version 4.2.0 (2022-04-22)
## os Ubuntu 20.04.5 LTS
## system x86_64, linux-gnu
## ui RStudio
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz Etc/UTC
## date 2022-10-27
## rstudio 2022.02.3+492 Prairie Trillium (server)
## pandoc 2.17.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## abind 1.4-5 2016-07-21 [1] RSPM (R 4.2.0)
## arrayhelpers 1.1-0 2020-02-04 [1] RSPM (R 4.2.0)
## assertthat 0.2.1 2019-03-21 [1] RSPM (R 4.2.0)
## backports 1.4.1 2021-12-13 [1] RSPM (R 4.2.0)
## base64enc 0.1-3 2015-07-28 [1] RSPM (R 4.2.0)
## bayesplot * 1.9.0 2022-03-10 [1] RSPM (R 4.2.0)
## bit 4.0.4 2020-08-04 [1] RSPM (R 4.2.0)
## bit64 4.0.5 2020-08-30 [1] RSPM (R 4.2.0)
## bookdown 0.27 2022-06-14 [1] RSPM (R 4.2.0)
## boot 1.3-28 2021-05-03 [2] CRAN (R 4.2.0)
## bridgesampling 1.1-2 2021-04-16 [1] RSPM (R 4.2.0)
## brms * 2.17.0 2022-09-26 [1] Github (paul-buerkner/brms@a43937c)
## Brobdingnag 1.2-7 2022-02-03 [1] RSPM (R 4.2.0)
## broom 0.8.0 2022-04-13 [1] RSPM (R 4.2.0)
## bslib 0.3.1 2021-10-06 [1] RSPM (R 4.2.0)
## cachem 1.0.6 2021-08-19 [1] RSPM (R 4.2.0)
## callr 3.7.0 2021-04-20 [1] RSPM (R 4.2.0)
## cellranger 1.1.0 2016-07-27 [1] RSPM (R 4.2.0)
## checkmate 2.1.0 2022-04-21 [1] RSPM (R 4.2.0)
## cli 3.3.0 2022-04-25 [1] RSPM (R 4.2.0)
## cmdstanr * 0.5.3 2022-09-26 [1] Github (stan-dev/cmdstanr@22b391e)
## coda 0.19-4 2020-09-30 [1] RSPM (R 4.2.0)
## codetools 0.2-18 2020-11-04 [2] CRAN (R 4.2.0)
## colorspace 2.0-3 2022-02-21 [1] RSPM (R 4.2.0)
## colourpicker 1.1.1 2021-10-04 [1] RSPM (R 4.2.0)
## conflicted * 1.1.0 2021-11-26 [1] RSPM (R 4.2.0)
## cowplot * 1.1.1 2020-12-30 [1] RSPM (R 4.2.0)
## crayon 1.5.1 2022-03-26 [1] RSPM (R 4.2.0)
## crosstalk 1.2.0 2021-11-04 [1] RSPM (R 4.2.0)
## curl 4.3.2 2021-06-23 [1] RSPM (R 4.2.0)
## DBI 1.1.3 2022-06-18 [1] RSPM (R 4.2.0)
## dbplyr 2.2.0 2022-06-05 [1] RSPM (R 4.2.0)
## digest 0.6.29 2021-12-01 [1] RSPM (R 4.2.0)
## directlabels * 2021.1.13 2021-01-16 [1] RSPM (R 4.2.0)
## distributional 0.3.0 2022-01-05 [1] RSPM (R 4.2.0)
## dplyr * 1.0.9 2022-04-28 [1] RSPM (R 4.2.0)
## DT 0.23 2022-05-10 [1] RSPM (R 4.2.0)
## dygraphs 1.1.1.6 2018-07-11 [1] RSPM (R 4.2.0)
## ellipsis 0.3.2 2021-04-29 [1] RSPM (R 4.2.0)
## evaluate 0.15 2022-02-18 [1] RSPM (R 4.2.0)
## fansi 1.0.3 2022-03-24 [1] RSPM (R 4.2.0)
## farver 2.1.0 2021-02-28 [1] RSPM (R 4.2.0)
## fastmap 1.1.0 2021-01-25 [1] RSPM (R 4.2.0)
## forcats * 0.5.1 2021-01-27 [1] RSPM (R 4.2.0)
## fs * 1.5.2 2021-12-08 [1] RSPM (R 4.2.0)
## furrr * 0.3.0 2022-05-04 [1] RSPM (R 4.2.0)
## future * 1.26.1 2022-05-27 [1] RSPM (R 4.2.0)
## gamm4 0.2-6 2020-04-03 [1] RSPM (R 4.2.0)
## generics 0.1.2 2022-01-31 [1] RSPM (R 4.2.0)
## ggdist 3.1.1 2022-02-27 [1] RSPM (R 4.2.0)
## ggplot2 * 3.3.6 2022-05-03 [1] RSPM (R 4.2.0)
## ggridges 0.5.3 2021-01-08 [1] RSPM (R 4.2.0)
## globals 0.15.0 2022-05-09 [1] RSPM (R 4.2.0)
## glue * 1.6.2 2022-02-24 [1] RSPM (R 4.2.0)
## gridExtra 2.3 2017-09-09 [1] RSPM (R 4.2.0)
## gtable 0.3.0 2019-03-25 [1] RSPM (R 4.2.0)
## gtools 3.9.2.2 2022-06-13 [1] RSPM (R 4.2.0)
## haven 2.5.0 2022-04-15 [1] RSPM (R 4.2.0)
## highr 0.9 2021-04-16 [1] RSPM (R 4.2.0)
## hms 1.1.1 2021-09-26 [1] RSPM (R 4.2.0)
## htmltools 0.5.2 2021-08-25 [1] RSPM (R 4.2.0)
## htmlwidgets 1.5.4 2021-09-08 [1] RSPM (R 4.2.0)
## httpuv 1.6.5 2022-01-05 [1] RSPM (R 4.2.0)
## httr 1.4.3 2022-05-04 [1] RSPM (R 4.2.0)
## igraph 1.3.2 2022-06-13 [1] RSPM (R 4.2.0)
## inline 0.3.19 2021-05-31 [1] RSPM (R 4.2.0)
## jquerylib 0.1.4 2021-04-26 [1] RSPM (R 4.2.0)
## jsonlite 1.8.0 2022-02-22 [1] RSPM (R 4.2.0)
## knitr 1.39 2022-04-26 [1] RSPM (R 4.2.0)
## labeling 0.4.2 2020-10-20 [1] RSPM (R 4.2.0)
## later 1.3.0 2021-08-18 [1] RSPM (R 4.2.0)
## lattice 0.20-45 2021-09-22 [2] CRAN (R 4.2.0)
## lifecycle 1.0.1 2021-09-24 [1] RSPM (R 4.2.0)
## listenv 0.8.0 2019-12-05 [1] RSPM (R 4.2.0)
## lme4 1.1-29 2022-04-07 [1] RSPM (R 4.2.0)
## loo 2.5.1 2022-03-24 [1] RSPM (R 4.2.0)
## lubridate 1.8.0 2021-10-07 [1] RSPM (R 4.2.0)
## magrittr * 2.0.3 2022-03-30 [1] RSPM (R 4.2.0)
## markdown 1.1 2019-08-07 [1] RSPM (R 4.2.0)
## MASS 7.3-56 2022-03-23 [2] CRAN (R 4.2.0)
## Matrix 1.4-1 2022-03-23 [2] CRAN (R 4.2.0)
## matrixStats 0.62.0 2022-04-19 [1] RSPM (R 4.2.0)
## memoise 2.0.1 2021-11-26 [1] RSPM (R 4.2.0)
## mgcv 1.8-40 2022-03-29 [2] CRAN (R 4.2.0)
## mime 0.12 2021-09-28 [1] RSPM (R 4.2.0)
## miniUI 0.1.1.1 2018-05-18 [1] RSPM (R 4.2.0)
## minqa 1.2.4 2014-10-09 [1] RSPM (R 4.2.0)
## modelr 0.1.8 2020-05-19 [1] RSPM (R 4.2.0)
## munsell 0.5.0 2018-06-12 [1] RSPM (R 4.2.0)
## mvtnorm 1.1-3 2021-10-08 [1] RSPM (R 4.2.0)
## nlme 3.1-157 2022-03-25 [2] CRAN (R 4.2.0)
## nloptr 2.0.3 2022-05-26 [1] RSPM (R 4.2.0)
## parallelly 1.32.0 2022-06-07 [1] RSPM (R 4.2.0)
## pillar 1.7.0 2022-02-01 [1] RSPM (R 4.2.0)
## pkgbuild 1.3.1 2021-12-20 [1] RSPM (R 4.2.0)
## pkgconfig 2.0.3 2019-09-22 [1] RSPM (R 4.2.0)
## plyr 1.8.7 2022-03-24 [1] RSPM (R 4.2.0)
## posterior * 1.2.2 2022-06-09 [1] RSPM (R 4.2.0)
## prettyunits 1.1.1 2020-01-24 [1] RSPM (R 4.2.0)
## processx 3.6.1 2022-06-17 [1] RSPM (R 4.2.0)
## projpred 2.1.2 2022-05-13 [1] RSPM (R 4.2.0)
## promises 1.2.0.1 2021-02-11 [1] RSPM (R 4.2.0)
## ps 1.7.1 2022-06-18 [1] RSPM (R 4.2.0)
## purrr * 0.3.4 2020-04-17 [1] RSPM (R 4.2.0)
## quadprog 1.5-8 2019-11-20 [1] RSPM (R 4.2.0)
## R6 2.5.1 2021-08-19 [1] RSPM (R 4.2.0)
## Rcpp * 1.0.8.3 2022-03-17 [1] RSPM (R 4.2.0)
## RcppParallel 5.1.5 2022-01-05 [1] RSPM (R 4.2.0)
## readr * 2.1.2 2022-01-30 [1] RSPM (R 4.2.0)
## readxl 1.4.0 2022-03-28 [1] RSPM (R 4.2.0)
## reprex 2.0.1 2021-08-05 [1] RSPM (R 4.2.0)
## reshape2 1.4.4 2020-04-09 [1] RSPM (R 4.2.0)
## rlang * 1.0.2 2022-03-04 [1] RSPM (R 4.2.0)
## rmarkdown 2.14 2022-04-25 [1] RSPM (R 4.2.0)
## rmdformats 1.0.4 2022-05-17 [1] RSPM (R 4.2.0)
## rstan 2.26.13 2022-09-26 [1] local
## rstantools 2.2.0 2022-04-08 [1] RSPM (R 4.2.0)
## rstudioapi 0.13 2020-11-12 [1] RSPM (R 4.2.0)
## rvest 1.0.2 2021-10-16 [1] RSPM (R 4.2.0)
## sass 0.4.1 2022-03-23 [1] RSPM (R 4.2.0)
## scales * 1.2.0 2022-04-13 [1] RSPM (R 4.2.0)
## sessioninfo 1.2.2 2021-12-06 [1] RSPM (R 4.2.0)
## shiny 1.7.1 2021-10-02 [1] RSPM (R 4.2.0)
## shinyjs 2.1.0 2021-12-23 [1] RSPM (R 4.2.0)
## shinystan 2.6.0 2022-03-03 [1] RSPM (R 4.2.0)
## shinythemes 1.2.0 2021-01-25 [1] RSPM (R 4.2.0)
## StanHeaders 2.26.13 2022-09-26 [1] local
## stringi 1.7.6 2021-11-29 [1] RSPM (R 4.2.0)
## stringr * 1.4.0 2019-02-10 [1] RSPM (R 4.2.0)
## svUnit 1.0.6 2021-04-19 [1] RSPM (R 4.2.0)
## tensorA 0.36.2 2020-11-19 [1] RSPM (R 4.2.0)
## threejs 0.3.3 2020-01-21 [1] RSPM (R 4.2.0)
## tibble * 3.1.7 2022-05-03 [1] RSPM (R 4.2.0)
## tidybayes * 3.0.2.9000 2022-09-26 [1] Github (mjskay/tidybayes@1efbdef)
## tidyr * 1.2.0 2022-02-01 [1] RSPM (R 4.2.0)
## tidyselect 1.1.2 2022-02-21 [1] RSPM (R 4.2.0)
## tidyverse * 1.3.1 2021-04-15 [1] RSPM (R 4.2.0)
## tzdb 0.3.0 2022-03-28 [1] RSPM (R 4.2.0)
## utf8 1.2.2 2021-07-24 [1] RSPM (R 4.2.0)
## V8 4.2.0 2022-05-14 [1] RSPM (R 4.2.0)
## vctrs 0.4.1 2022-04-13 [1] RSPM (R 4.2.0)
## vroom 1.5.7 2021-11-30 [1] RSPM (R 4.2.0)
## withr 2.5.0 2022-03-03 [1] RSPM (R 4.2.0)
## xfun 0.31 2022-05-10 [1] RSPM (R 4.2.0)
## xml2 1.3.3 2021-11-30 [1] RSPM (R 4.2.0)
## xtable 1.8-4 2019-04-21 [1] RSPM (R 4.2.0)
## xts 0.12.1 2020-09-09 [1] RSPM (R 4.2.0)
## yaml 2.3.5 2022-02-21 [1] RSPM (R 4.2.0)
## zoo 1.8-10 2022-04-15 [1] RSPM (R 4.2.0)
##
## [1] /usr/local/lib/R/site-library
## [2] /usr/local/lib/R/library
##
## ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
# Restore the narrower console width set before printing the session info.
options(width = 80L)